Implement custom Netscape file parser (#51)

* Implement custom Netscape file parser (#50)

* Add environment variable to configure request timeouts (#50)

Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
Authored by Sascha Ißbrücker on 2020-12-31 07:02:28 +01:00, committed by GitHub
parent 50a825b3ca
commit 779de41b65
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 111 additions and 51 deletions


@@ -71,6 +71,21 @@ For automatic backups you want to backup the applications database. As described
The application provides a REST API that can be used by 3rd party applications to manage bookmarks. Check the [API docs](API.md) for further information.
## Troubleshooting
**Import fails with `502 Bad Gateway`**
The default timeout for requests is 60 seconds, after which the application server will cancel the request and return the above error.
Depending on the system that the application runs on, and the number of bookmarks that need to be imported, the import may take longer than the default 60 seconds.
To increase the timeout you can provide a custom timeout to the Docker container using the `LD_REQUEST_TIMEOUT` environment variable:
```
docker run --name linkding -p 9090:9090 -e LD_REQUEST_TIMEOUT=180 -d sissbruecker/linkding:latest
```
Note that any proxy servers that you are running in front of linkding may have their own timeout settings, which are not affected by the variable.
## Development
The application is open source, so you are free to modify or contribute. The application is built using the Django web framework. You can get started by checking out the excellent Django docs: https://docs.djangoproject.com/en/3.0/. The `bookmarks` folder contains the actual bookmark application, `siteroot` is the Django root application. Other than that the code should be self-explanatory / standard Django stuff 🙂.


@@ -2,11 +2,10 @@ import logging
 from dataclasses import dataclass
 from datetime import datetime
 
-import bs4
-from bs4 import BeautifulSoup
 from django.contrib.auth.models import User
 
 from bookmarks.models import Bookmark, parse_tag_string
+from bookmarks.services.parser import parse, NetscapeBookmark
 from bookmarks.services.tags import get_or_create_tags
 
 logger = logging.getLogger(__name__)
@@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
     result = ImportResult()
 
     try:
-        soup = BeautifulSoup(html, 'html.parser')
+        netscape_bookmarks = parse(html)
     except:
         logging.exception('Could not read bookmarks file.')
         raise
 
-    bookmark_tags = soup.find_all('dt')
-    for bookmark_tag in bookmark_tags:
+    for netscape_bookmark in netscape_bookmarks:
         result.total = result.total + 1
         try:
-            _import_bookmark_tag(bookmark_tag, user)
+            _import_bookmark_tag(netscape_bookmark, user)
             result.success = result.success + 1
         except:
-            shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
+            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
             logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
             result.failed = result.failed + 1
 
     return result
 
 
-def _import_bookmark_tag(bookmark_tag: bs4.Tag, user: User):
-    link_tag = bookmark_tag.a
-    if link_tag is None:
-        return
+def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
     # Either modify existing bookmark for the URL or create new one
-    url = link_tag['href']
-    description = _extract_description(bookmark_tag)
-    bookmark = _get_or_create_bookmark(url, user)
+    bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)
 
-    bookmark.url = url
-    add_date = link_tag.get('add_date', datetime.now().timestamp())
-    bookmark.date_added = datetime.utcfromtimestamp(int(add_date)).astimezone()
+    bookmark.url = netscape_bookmark.href
+    bookmark.date_added = datetime.utcfromtimestamp(int(netscape_bookmark.date_added)).astimezone()
     bookmark.date_modified = bookmark.date_added
-    bookmark.unread = link_tag.get('toread', '0') == '1'
-    bookmark.title = link_tag.string
-    if description:
-        bookmark.description = description
+    bookmark.unread = False
+    bookmark.title = netscape_bookmark.title
+    if netscape_bookmark.description:
+        bookmark.description = netscape_bookmark.description
     bookmark.owner = user
     bookmark.save()
 
     # Set tags
-    tag_string = link_tag.get('tags', '')
-    tag_names = parse_tag_string(tag_string)
+    tag_names = parse_tag_string(netscape_bookmark.tag_string)
     tags = get_or_create_tags(tag_names, user)
     bookmark.tags.set(tags)
@@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
         return Bookmark.objects.get(url=url, owner=user)
     except Bookmark.DoesNotExist:
         return Bookmark()
-
-
-def _extract_description(bookmark_tag: bs4.Tag):
-    """
-    Since the Netscape HTML format has no closing tags, all following bookmark tags are part of the description tag
-    so to extract the description text we have to get creative. For now we combine the text of all text nodes until we
-    detect a <dt> tag which indicates a new bookmark
-    :param bookmark_tag:
-    :return:
-    """
-    description_tag = bookmark_tag.find('dd', recursive=False)
-    if description_tag is None:
-        return None
-
-    description = ''
-    for content in description_tag.contents:
-        if type(content) is bs4.element.Tag and content.name == 'dt':
-            break
-        if type(content) is bs4.element.NavigableString:
-            description += content
-    return description.strip()
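For orientation, a minimal sketch of how the refactored importer could be driven from a Django shell (`python manage.py shell`). The module path `bookmarks.services.importer`, the username `demo`, and the file name `bookmarks.html` are illustrative assumptions; `import_netscape_html` and the `ImportResult` counters come from the diff above:

```
# Hypothetical driver; 'demo' and 'bookmarks.html' are placeholders.
from django.contrib.auth.models import User

from bookmarks.services.importer import import_netscape_html  # assumed module path

user = User.objects.get(username='demo')
with open('bookmarks.html', encoding='utf-8') as f:
    result = import_netscape_html(f.read(), user)

# ImportResult counters maintained by the import loop above
print(result.total, result.success, result.failed)
```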


@@ -0,0 +1,73 @@
from dataclasses import dataclass
from datetime import datetime

import pyparsing as pp


@dataclass
class NetscapeBookmark:
    href: str
    title: str
    description: str
    date_added: int
    tag_string: str


def extract_bookmark_link(tag):
    href = tag[0].href
    title = tag[0].text
    tag_string = tag[0].tags
    date_added_string = tag[0].add_date if tag[0].add_date else datetime.now().timestamp()
    date_added = int(date_added_string)

    return {
        'href': href,
        'title': title,
        'tag_string': tag_string,
        'date_added': date_added
    }


def extract_bookmark(tag):
    link = tag[0].link
    description = tag[0].description
    description = description[0] if description else ''

    return {
        'link': link,
        'description': description,
    }


def extract_description(tag):
    return tag[0].strip()


# define grammar
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")

bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)

bookmark_description_tag = dd_start.suppress() + pp.SkipTo(pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)

bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)


def parse(html: str) -> [NetscapeBookmark]:
    matches = bookmark_tag.searchString(html)
    bookmarks = []

    for match in matches:
        bookmark_match = match[0]
        bookmark = NetscapeBookmark(
            href=bookmark_match['link']['href'],
            title=bookmark_match['link']['title'],
            description=bookmark_match['description'],
            tag_string=bookmark_match['link']['tag_string'],
            date_added=bookmark_match['link']['date_added'],
        )
        bookmarks.append(bookmark)

    return bookmarks
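To see what this grammar yields, a small hypothetical usage sketch follows. The sample export string is made up; the module path matches the `from bookmarks.services.parser import parse, NetscapeBookmark` import in the importer diff above:

```
# Hypothetical example; the Netscape export snippet below is made up.
from bookmarks.services.parser import parse  # assumed module path

sample = '''
<DL><p>
    <DT><A HREF="https://example.com" ADD_DATE="1609459200" TAGS="example,sample">Example title</A>
    <DD>Example description
</DL><p>
'''

for bookmark in parse(sample):
    # Each match is returned as a NetscapeBookmark dataclass instance
    print(bookmark.href, bookmark.title, bookmark.tag_string, bookmark.date_added)
    print(bookmark.description)
```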


@@ -35,7 +35,7 @@ def bookmark_import(request):
         return HttpResponseRedirect(reverse('bookmarks:settings.index'))
 
     try:
-        content = import_file.read()
+        content = import_file.read().decode()
         result = import_netscape_html(content, request.user)
         success_msg = str(result.success) + ' bookmarks were successfully imported.'
         messages.success(request, success_msg, 'bookmark_import_success')
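The single changed line matters because Django returns uploaded file content as `bytes`, while the new parser-based importer works on `str`. A rough standalone illustration (the byte string is made up):

```
# Why .decode() is needed: request.FILES uploads are read as bytes,
# but parse()/import_netscape_html operate on str.
raw = b'<DT><A HREF="https://example.com">Example</A>'
html = raw.decode()  # UTF-8 by default
assert isinstance(html, str)
```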


@@ -11,6 +11,7 @@ django-sass-processor==0.7.3
django-widget-tweaks==1.4.5
djangorestframework==3.11.1
idna==2.8
pyparsing==2.4.7
pytz==2019.1
rcssmin==1.0.6
requests==2.22.0


@@ -13,6 +13,7 @@ django-widget-tweaks==1.4.5
djangorestframework==3.11.1
idna==2.8
libsass==0.19.2
pyparsing==2.4.7
pytz==2019.1
rcssmin==1.0.6
requests==2.22.0


@@ -11,3 +11,9 @@ vacuum=True
stats = 127.0.0.1:9191
uid = www-data
gid = www-data
if-env = LD_REQUEST_TIMEOUT
http-timeout = %(_)
socket-timeout = %(_)
harakiri = %(_)
endif =
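For reference, under uWSGI's configuration logic the `if-env` block should only take effect when `LD_REQUEST_TIMEOUT` is set, with `%(_)` expanding to the variable's value, so the same number of seconds is applied to `http-timeout`, `socket-timeout`, and `harakiri`.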