linkding/bookmarks/services/importer.py

107 lines
3.0 KiB
Python
Raw Normal View History

import logging
from dataclasses import dataclass
2019-06-29 06:42:54 +00:00
from datetime import datetime
2019-06-30 05:15:46 +00:00
import bs4
from bs4 import BeautifulSoup
2019-06-29 06:42:54 +00:00
from django.contrib.auth.models import User
2019-07-01 20:05:38 +00:00
from bookmarks.models import Bookmark, parse_tag_string
2019-07-03 15:18:29 +00:00
from bookmarks.services.tags import get_or_create_tags
2019-06-29 06:42:54 +00:00
logger = logging.getLogger(__name__)
@dataclass
class ImportResult:
    """Counters describing the outcome of a single bookmark import run."""

    # Number of bookmark entries that were processed
    total: int = 0
    # Number of entries that were imported without raising an error
    success: int = 0
    # Number of entries whose import raised an error
    failed: int = 0
2019-06-29 06:42:54 +00:00
def import_netscape_html(html: str, user: User):
    """
    Import all bookmarks found in a Netscape bookmark HTML document.

    Every ``<dt>`` element of the document is treated as one bookmark entry and is
    passed to ``_import_bookmark_tag``. An error raised for a single entry is logged
    and counted as failed; the import then continues with the next entry.

    :param html: content of the Netscape bookmark HTML file
    :param user: user that becomes the owner of the imported bookmarks
    :return: an ``ImportResult`` holding the total, success and failed counts
    :raises Exception: re-raises any error raised while parsing the document
    """
    result = ImportResult()

    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        logger.exception('Could not read bookmarks file.')
        raise

    for bookmark_tag in soup.find_all('dt'):
        result.total += 1
        try:
            _import_bookmark_tag(bookmark_tag, user)
        except Exception:
            # Best effort: a broken entry must not abort the whole import. Only
            # ``Exception`` is caught so that KeyboardInterrupt / SystemExit still
            # propagate instead of being counted as a failed bookmark.
            # The tag is truncated to keep the log entry short.
            logger.exception('Error importing bookmark: %s...', str(bookmark_tag)[:100])
            result.failed += 1
        else:
            result.success += 1

    return result
2019-06-29 06:42:54 +00:00
2019-06-30 05:15:46 +00:00
def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
    """
    Create or update the bookmark described by a single ``<dt>`` element.

    An existing bookmark of ``user`` with the same URL is updated, otherwise a new one
    is created. Entries without an ``<a>`` element are skipped silently. Errors (for
    example a missing ``href`` attribute or a non-numeric ``add_date``) are not handled
    here and propagate to the caller.

    :param bookmark_tag: the ``<dt>`` element of the bookmark entry
    :param user: user that owns the bookmark
    :return: None
    """
    # Imported locally because the module itself only imports ``datetime.datetime``
    from datetime import timezone

    link_tag = bookmark_tag.a
    if link_tag is None:
        # No link in this entry, nothing to import
        return

    # Either modify existing bookmark for the URL or create new one
    url = link_tag['href']
    description = _extract_description(bookmark_tag)

    bookmark = _get_or_create_bookmark(url, user)
    bookmark.url = url

    # ``add_date`` is a POSIX timestamp in seconds; fall back to the current time.
    # The timestamp is converted with an explicit UTC zone: building a naive UTC
    # datetime and calling ``astimezone()`` on it would interpret the value as local
    # time and shift the stored instant by the local UTC offset.
    add_date = link_tag.get('add_date', datetime.now().timestamp())
    bookmark.date_added = datetime.fromtimestamp(int(add_date), tz=timezone.utc)
    bookmark.date_modified = bookmark.date_added

    # Only the exact value '1' marks a bookmark as unread
    bookmark.unread = link_tag.get('toread', '0') == '1'
    bookmark.title = link_tag.string
    # Keep the current description when the entry does not provide one
    if description:
        bookmark.description = description
    bookmark.owner = user
    # Saved before assigning tags -- presumably the relation needs a persisted bookmark
    bookmark.save()

    # Set tags
    tag_string = link_tag.get('tags', '')
    tag_names = parse_tag_string(tag_string)
    tags = get_or_create_tags(tag_names, user)
    bookmark.tags.set(tags)
    # NOTE(review): this second save looks redundant after ``tags.set`` -- confirm before removing
    bookmark.save()
2019-06-29 06:42:54 +00:00
def _get_or_create_bookmark(url: str, user: User):
    """
    Look up the bookmark of ``user`` for ``url``, or build a new one.

    :param url: URL of the bookmark to look up
    :param user: owner the lookup is restricted to
    :return: the matching ``Bookmark``, or a new empty ``Bookmark`` instance (this
        function does not save it) when no match exists
    """
    try:
        existing_bookmark = Bookmark.objects.get(url=url, owner=user)
    except Bookmark.DoesNotExist:
        return Bookmark()
    return existing_bookmark
def _extract_description(bookmark_tag: bs4.Tag):
    """
    Return the description text that belongs to a bookmark entry.

    The Netscape HTML format does not close its tags, so all following bookmark
    entries end up inside the description (``<dd>``) element of the current one.
    To get only the text of this entry, the direct text nodes of the ``<dd>``
    element are concatenated until the first ``<dt>`` element, which marks the
    start of the next bookmark.

    :param bookmark_tag: the ``<dt>`` element of the bookmark entry
    :return: the stripped description text, or ``None`` if the entry has no direct
        ``<dd>`` child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        # Exact type comparisons on purpose: subclasses of Tag / NavigableString
        # are not matched by these checks
        node_type = type(node)
        if node_type is bs4.element.Tag and node.name == 'dt':
            # The next bookmark starts here, everything after is not ours
            break
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()