linkding/bookmarks/services/parser.py
Sascha Ißbrücker 779de41b65
Implement custom netscape file parser (#51)
* Implement custom Netscape file parser (#50)

* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
2020-12-31 07:02:28 +01:00

74 lines
2.0 KiB
Python

from dataclasses import dataclass
from datetime import datetime
import pyparsing as pp
@dataclass
class NetscapeBookmark:
href: str
title: str
description: str
date_added: int
tag_string: str
def extract_bookmark_link(tag):
href = tag[0].href
title = tag[0].text
tag_string = tag[0].tags
date_added_string = tag[0].add_date if tag[0].add_date else datetime.now().timestamp()
date_added = int(date_added_string)
return {
'href': href,
'title': title,
'tag_string': tag_string,
'date_added': date_added
}
def extract_bookmark(tag):
link = tag[0].link
description = tag[0].description
description = description[0] if description else ''
return {
'link': link,
'description': description,
}
def extract_description(tag):
return tag[0].strip()
# define grammar
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")
bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)
def parse(html: str) -> [NetscapeBookmark]:
matches = bookmark_tag.searchString(html)
bookmarks = []
for match in matches:
bookmark_match = match[0]
bookmark = NetscapeBookmark(
href=bookmark_match['link']['href'],
title=bookmark_match['link']['title'],
description=bookmark_match['description'],
tag_string=bookmark_match['link']['tag_string'],
date_added=bookmark_match['link']['date_added'],
)
bookmarks.append(bookmark)
return bookmarks