linkding/bookmarks/services/importer.py

import logging
from dataclasses import dataclass
from datetime import datetime

import bs4
from bs4 import BeautifulSoup
from django.contrib.auth.models import User

from bookmarks.models import Bookmark, parse_tag_string
from bookmarks.services.tags import get_or_create_tags

logger = logging.getLogger(__name__)


@dataclass
class ImportResult:
    """
    Counters describing the outcome of a single bookmark import run.

    All counters start at zero and are incremented by the importer while it walks over the bookmark entries.
    """
    # Number of bookmark entries that were processed
    total: int = 0
    # Number of entries that were processed without raising an error
    success: int = 0
    # Number of entries for which processing raised an error
    failed: int = 0

def import_netscape_html(html: str, user: User):
    """
    Import all bookmarks found in a Netscape bookmark HTML export.

    Every <dt> element of the document is handed to the per-bookmark import. An error while importing one entry is
    logged and counted as failed, then the import continues with the next entry. Entries for which the per-bookmark
    import returns without raising are counted as successful.
    :param html: content of the bookmarks file
    :param user: user that is passed on to the per-bookmark import
    :return: ImportResult with the number of processed, successful and failed entries
    :raises Exception: re-raises any error raised while parsing the document
    """
    result = ImportResult()

    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        # Log through the module logger so the record carries this module's name, then let the caller decide
        logger.exception('Could not read bookmarks file.')
        raise

    for bookmark_tag in soup.find_all('dt'):
        result.total += 1
        try:
            _import_bookmark_tag(bookmark_tag, user)
        except Exception:
            # Only swallow regular errors: a bare except would also swallow KeyboardInterrupt / SystemExit and
            # count them as a failed bookmark instead of aborting the import
            shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
            logger.exception('Error importing bookmark: %s', shortened_bookmark_tag_str)
            result.failed += 1
        else:
            result.success += 1

    return result


def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
    """
    Create or update the bookmark described by a single <dt> element.

    Does nothing if the element contains no link. An existing bookmark of the user with the same URL is updated,
    otherwise a new bookmark is created. The existing description is only overwritten if the element provides a
    non-empty one.
    :param bookmark_tag: <dt> element of the bookmarks document
    :param user: owner of the bookmark and of its tags
    :return: None
    :raises KeyError: if the link has no href attribute
    :raises ValueError: if the add_date attribute is not an integer
    """
    link_tag = bookmark_tag.a

    if link_tag is None:
        return

    # Either modify existing bookmark for the URL or create new one
    url = link_tag['href']
    description = _extract_description(bookmark_tag)
    bookmark = _get_or_create_bookmark(url, user)

    bookmark.url = url
    # Fall back to the current time if the export carries no creation date
    add_date = link_tag.get('add_date', datetime.now().timestamp())
    # fromtimestamp() yields the local wall-clock time, which astimezone() then marks with the system timezone.
    # utcfromtimestamp() must not be used here: its naive UTC result would be interpreted as local time by
    # astimezone(), shifting the date by the system's UTC offset.
    bookmark.date_added = datetime.fromtimestamp(int(add_date)).astimezone()
    bookmark.date_modified = bookmark.date_added
    # A missing toread attribute means the bookmark is not marked as unread
    bookmark.unread = link_tag.get('toread', '0') == '1'
    bookmark.title = link_tag.string
    if description:
        bookmark.description = description
    bookmark.owner = user

    # The bookmark is saved before tags are assigned to it
    bookmark.save()

    # Set tags
    tag_string = link_tag.get('tags', '')
    tag_names = parse_tag_string(tag_string)
    tags = get_or_create_tags(tag_names, user)

    bookmark.tags.set(tags)
    bookmark.save()


def _get_or_create_bookmark(url: str, user: User):
    """
    Look up the bookmark of the given user for the given URL.
    :param url: URL of the bookmark
    :param user: owner of the bookmark
    :return: the matching bookmark, or a new Bookmark instance if no bookmark matches
    """
    try:
        existing_bookmark = Bookmark.objects.get(url=url, owner=user)
    except Bookmark.DoesNotExist:
        # Nothing stored for this URL yet, hand out a fresh instance
        return Bookmark()

    return existing_bookmark


def _extract_description(bookmark_tag: bs4.Tag):
    """
    Read the description text that belongs to a bookmark.

    The Netscape HTML format has no closing tags, which makes all following bookmark tags part of the description
    tag. The description is therefore assembled from the plain text nodes of the <dd> tag, stopping at the first
    <dt> tag as that indicates the start of a new bookmark.
    :param bookmark_tag: <dt> element of the bookmark
    :return: the stripped description text, or None if the bookmark has no direct <dd> child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        node_type = type(node)
        # A nested <dt> marks the next bookmark, everything from here on is not part of this description
        if node_type is bs4.element.Tag and node.name == 'dt':
            break
        # Exact type match on purpose: only plain text nodes are collected
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()
Add error handling and logging for importer 2020-06-07 12:15:44 +00:00			`import logging`
			`from dataclasses import dataclass`
Implement basic importer 2019-06-29 06:42:54 +00:00			`from datetime import datetime`

Implement tag model 2019-06-30 05:15:46 +00:00			`import bs4`
			`from bs4 import BeautifulSoup`
Implement basic importer 2019-06-29 06:42:54 +00:00			`from django.contrib.auth.models import User`

Edit bookmark tags 2019-07-01 20:05:38 +00:00			`from bookmarks.models import Bookmark, parse_tag_string`
Create docker image 2019-07-03 15:18:29 +00:00			`from bookmarks.services.tags import get_or_create_tags`
Implement basic importer 2019-06-29 06:42:54 +00:00
Add error handling and logging for importer 2020-06-07 12:15:44 +00:00			`logger = logging.getLogger(__name__)`


			`@dataclass`
			`class ImportResult:`
			`total: int = 0`
			`success: int = 0`
			`failed: int = 0`

Implement basic importer 2019-06-29 06:42:54 +00:00
			`def import_netscape_html(html: str, user: User):`
Add error handling and logging for importer 2020-06-07 12:15:44 +00:00			`result = ImportResult()`

			`try:`
			`soup = BeautifulSoup(html, 'html.parser')`
			`except:`
			`logging.exception('Could not read bookmarks file.')`
			`raise`
Implement basic importer 2019-06-29 06:42:54 +00:00
			`bookmark_tags = soup.find_all('dt')`

			`for bookmark_tag in bookmark_tags:`
Add error handling and logging for importer 2020-06-07 12:15:44 +00:00			`result.total = result.total + 1`
			`try:`
			`_import_bookmark_tag(bookmark_tag, user)`
			`result.success = result.success + 1`
			`except:`
			`shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'`
			`logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)`
			`result.failed = result.failed + 1`

			`return result`
Implement basic importer 2019-06-29 06:42:54 +00:00

Implement tag model 2019-06-30 05:15:46 +00:00			`def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):`
Implement basic importer 2019-06-29 06:42:54 +00:00			`link_tag = bookmark_tag.a`

			`if link_tag is None:`
			`return`

			`# Either modify existing bookmark for the URL or create new one`
			`url = link_tag['href']`
#47 Fix description import (#48) Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io> 2020-12-29 12:14:10 +00:00			`description = _extract_description(bookmark_tag)`
Implement basic importer 2019-06-29 06:42:54 +00:00			`bookmark = _get_or_create_bookmark(url, user)`

			`bookmark.url = url`
Use system timezone for bookmark's "Added date" 2020-06-06 17:28:43 +00:00			`add_date = link_tag.get('add_date', datetime.now().timestamp())`
			`bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()`
Implement basic importer 2019-06-29 06:42:54 +00:00			`bookmark.date_modified = bookmark.date_added`
Use a default value if attribute missing from tag 2020-06-06 17:27:43 +00:00			`bookmark.unread = link_tag.get('toread', '0') == '1'`
Implement basic importer 2019-06-29 06:42:54 +00:00			`bookmark.title = link_tag.string`
#47 Fix description import (#48) Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io> 2020-12-29 12:14:10 +00:00			`if description:`
			`bookmark.description = description`
Implement basic importer 2019-06-29 06:42:54 +00:00			`bookmark.owner = user`

			`bookmark.save()`

Implement tag model 2019-06-30 05:15:46 +00:00			`# Set tags`
Use a default value if attribute missing from tag 2020-06-06 17:27:43 +00:00			`tag_string = link_tag.get('tags', '')`
Edit bookmark tags 2019-07-01 20:05:38 +00:00			`tag_names = parse_tag_string(tag_string)`
			`tags = get_or_create_tags(tag_names, user)`
Implement tag model 2019-06-30 05:15:46 +00:00
			`bookmark.tags.set(tags)`
			`bookmark.save()`

Implement basic importer 2019-06-29 06:42:54 +00:00
			`def _get_or_create_bookmark(url: str, user: User):`
			`try:`
			`return Bookmark.objects.get(url=url, owner=user)`
			`except Bookmark.DoesNotExist:`
			`return Bookmark()`
#47 Fix description import (#48) Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io> 2020-12-29 12:14:10 +00:00

			`def _extract_description(bookmark_tag: bs4.Tag):`
			`"""`
			`Since the Netscape HTML format has no closing tags, all following bookmark tags are part of the description tag`
			`so to extract the description text we have to get creative. For now we combine the text of all text nodes until we`
			`detect a <dt> tag which indicates a new bookmark`
			`:param bookmark_tag:`
			`:return:`
			`"""`
			`description_tag = bookmark_tag.find('dd', recursive=False)`

			`if description_tag is None:`
			`return None`

			`description = ''`

			`for content in description_tag.contents:`
			`if type(content) is bs4.element.Tag and content.name == 'dt':`
			`break`
			`if type(content) is bs4.element.NavigableString:`
			`description += content`

			`return description.strip()`