From 477fed284dcc482e8f299e8b75906933dd5e8774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sascha=20I=C3=9Fbr=C3=BCcker?= Date: Tue, 29 Dec 2020 13:14:10 +0100 Subject: [PATCH] #47 Fix description import (#48) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sascha Ißbrücker --- bookmarks/services/importer.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bookmarks/services/importer.py b/bookmarks/services/importer.py index 1b7a8b8..6919b23 100644 --- a/bookmarks/services/importer.py +++ b/bookmarks/services/importer.py @@ -51,6 +51,7 @@ def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User): # Either modify existing bookmark for the URL or create new one url = link_tag['href'] + description = _extract_description(bookmark_tag) bookmark = _get_or_create_bookmark(url, user) bookmark.url = url @@ -59,6 +60,8 @@ def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User): bookmark.date_modified = bookmark.date_added bookmark.unread = link_tag.get('toread', '0') == '1' bookmark.title = link_tag.string + if description: + bookmark.description = description bookmark.owner = user bookmark.save() @@ -77,3 +80,27 @@ def _get_or_create_bookmark(url: str, user: User): return Bookmark.objects.get(url=url, owner=user) except Bookmark.DoesNotExist: return Bookmark() + + +def _extract_description(bookmark_tag: bs4.Tag): + """ + Since the Netscape HTML format has no closing tags, all following bookmark tags are part of the description tag + so to extract the description text we have to get creative. For now we combine the text of all text nodes until we + detect a
<dt> tag which indicates a new bookmark + :param bookmark_tag: + :return: + """ + description_tag = bookmark_tag.find('dd', recursive=False) + + if description_tag is None: + return None + + description = '' + + for content in description_tag.contents: + if type(content) is bs4.element.Tag and content.name == 'dt': + break + if type(content) is bs4.element.NavigableString: + description += content + + return description.strip()