2020-06-07 12:15:44 +00:00
|
|
|
import logging
|
|
|
|
from dataclasses import dataclass
|
2019-06-29 06:42:54 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
2019-06-30 05:15:46 +00:00
|
|
|
import bs4
|
|
|
|
from bs4 import BeautifulSoup
|
2019-06-29 06:42:54 +00:00
|
|
|
from django.contrib.auth.models import User
|
|
|
|
|
2019-07-01 20:05:38 +00:00
|
|
|
from bookmarks.models import Bookmark, parse_tag_string
|
2019-07-03 15:18:29 +00:00
|
|
|
from bookmarks.services.tags import get_or_create_tags
|
2019-06-29 06:42:54 +00:00
|
|
|
|
2020-06-07 12:15:44 +00:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ImportResult:
    """
    Counters describing the outcome of one bookmark import run.

    All counters start at zero and are incremented by the importer while it
    walks the bookmark entries of the file.
    """
    # Number of bookmark entries that were encountered
    total: int = 0
    # Number of entries that were processed without raising an error
    success: int = 0
    # Number of entries whose processing raised an error
    failed: int = 0
|
|
|
|
|
2019-06-29 06:42:54 +00:00
|
|
|
|
|
|
|
def import_netscape_html(html: str, user: User):
    """
    Import the bookmarks contained in a Netscape bookmark HTML document.

    Every <dt> element found in the document is treated as one bookmark entry
    and handed to _import_bookmark_tag. An entry that fails to import is
    logged and counted as failed; the remaining entries are still processed.

    :param html: content of the bookmarks file
    :param user: user that is passed on to _import_bookmark_tag for each entry
    :return: ImportResult with the total, successful and failed entry counts
    :raises Exception: whatever the HTML parser raises is logged and re-raised
    """
    result = ImportResult()

    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        logger.exception('Could not read bookmarks file.')
        raise

    for bookmark_tag in soup.find_all('dt'):
        result.total += 1
        try:
            _import_bookmark_tag(bookmark_tag, user)
        except Exception:
            # Only catch Exception: a bare except would also swallow
            # KeyboardInterrupt / SystemExit and make the import unstoppable.
            shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
            logger.exception('Error importing bookmark: %s', shortened_bookmark_tag_str)
            result.failed += 1
        else:
            result.success += 1

    return result
|
2019-06-29 06:42:54 +00:00
|
|
|
|
|
|
|
|
2019-06-30 05:15:46 +00:00
|
|
|
def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
    """
    Create or update one bookmark from a single <dt> entry.

    Entries without an <a> child are skipped silently. Otherwise the bookmark
    for the link's URL and the given user is loaded (or a new one is created),
    its fields are overwritten from the link attributes, and its tags are
    replaced by the ones listed in the link's ``tags`` attribute.

    :param bookmark_tag: the <dt> element of the bookmark entry
    :param user: owner of the bookmark and of any tags that get created
    :return: None
    :raises KeyError: if the link has no ``href`` attribute
    :raises ValueError: if ``add_date`` cannot be converted to an integer
    """
    # Imported here because the module only imports `datetime` itself
    from datetime import timezone

    link_tag = bookmark_tag.a
    if link_tag is None:
        return

    # Either modify existing bookmark for the URL or create new one
    url = link_tag['href']
    description = _extract_description(bookmark_tag)
    bookmark = _get_or_create_bookmark(url, user)

    bookmark.url = url
    # Fall back to the current time when the entry carries no add_date
    add_date = link_tag.get('add_date', datetime.now().timestamp())
    # add_date is a Unix timestamp, so build the aware datetime directly in
    # UTC. Going through a naive UTC datetime and astimezone() would treat the
    # value as local time and shift it by the host's UTC offset.
    bookmark.date_added = datetime.fromtimestamp(int(add_date), tz=timezone.utc)
    bookmark.date_modified = bookmark.date_added
    bookmark.unread = link_tag.get('toread', '0') == '1'
    bookmark.title = link_tag.string
    # Keep the existing description when the entry does not provide one
    if description:
        bookmark.description = description
    bookmark.owner = user

    # The bookmark has to be saved before tags can be assigned to it
    bookmark.save()

    # Set tags
    tag_string = link_tag.get('tags', '')
    tag_names = parse_tag_string(tag_string)
    tags = get_or_create_tags(tag_names, user)

    bookmark.tags.set(tags)
    # NOTE(review): kept from the original; the tag assignment above is
    # presumably persisted by set() itself, so this save may be redundant -
    # confirm against the Bookmark model before removing it.
    bookmark.save()
|
|
|
|
|
2019-06-29 06:42:54 +00:00
|
|
|
|
|
|
|
def _get_or_create_bookmark(url: str, user: User):
    """
    Look up the bookmark stored for the given URL and owner.

    :param url: URL the bookmark must match
    :param user: owner the bookmark must match
    :return: the matching Bookmark, or a new unsaved Bookmark instance if
        there is none
    """
    try:
        bookmark = Bookmark.objects.get(url=url, owner=user)
    except Bookmark.DoesNotExist:
        # Nothing stored yet - hand back an empty instance for the caller to fill
        bookmark = Bookmark()
    return bookmark
|
2020-12-29 12:14:10 +00:00
|
|
|
|
|
|
|
|
|
|
|
def _extract_description(bookmark_tag: bs4.Tag):
    """
    Extract the description text of a bookmark entry.

    The Netscape HTML format has no closing tags, so every bookmark that
    follows ends up nested inside the <dd> description tag of the current one.
    To get only this bookmark's description, the text nodes that are direct
    children of the <dd> tag are concatenated until a <dt> tag is reached,
    which marks the start of the next bookmark.

    :param bookmark_tag: the <dt> element of the bookmark entry
    :return: the stripped description text, or None if the entry has no
        direct <dd> child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        node_type = type(node)
        # A nested <dt> is the next bookmark - everything after it is not ours
        if node_type is bs4.element.Tag and node.name == 'dt':
            break
        # Exact type match on purpose: subclasses of NavigableString are skipped
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()
|