Implement custom Netscape file parser (#51)
* Implement custom Netscape file parser (#50) * Add environment variable to configure request timeouts (#50) Co-authored-by: Sascha Ißbrücker <sissbruecker@lyska.io>
This commit is contained in:
parent
50a825b3ca
commit
779de41b65
15
README.md
15
README.md
|
@ -71,6 +71,21 @@ For automatic backups you want to backup the applications database. As described
|
||||||
|
|
||||||
The application provides a REST API that can be used by 3rd party applications to manage bookmarks. Check the [API docs](API.md) for further information.
|
The application provides a REST API that can be used by 3rd party applications to manage bookmarks. Check the [API docs](API.md) for further information.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Import fails with `502 Bad Gateway`**
|
||||||
|
|
||||||
|
The default timeout for requests is 60 seconds, after which the application server will cancel the request and return the above error.
|
||||||
|
Depending on the system that the application runs on, and the number of bookmarks that need to be imported, the import may take longer than the default 60 seconds.
|
||||||
|
|
||||||
|
To increase the timeout you can provide a custom timeout to the Docker container using the `LD_REQUEST_TIMEOUT` environment variable:
|
||||||
|
|
||||||
|
```
|
||||||
|
docker run --name linkding -p 9090:9090 -e LD_REQUEST_TIMEOUT=180 -d sissbruecker/linkding:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that any proxy servers running in front of linkding may have their own timeout settings. These are not affected by `LD_REQUEST_TIMEOUT` and have to be increased separately.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
The application is open source, so you are free to modify or contribute. The application is built using the Django web framework. You can get started by checking out the excellent Django docs: https://docs.djangoproject.com/en/3.0/. The `bookmarks` folder contains the actual bookmark application, `siteroot` is the Django root application. Other than that the code should be self-explanatory / standard Django stuff 🙂.
|
The application is open source, so you are free to modify or contribute. The application is built using the Django web framework. You can get started by checking out the excellent Django docs: https://docs.djangoproject.com/en/3.0/. The `bookmarks` folder contains the actual bookmark application, `siteroot` is the Django root application. Other than that the code should be self-explanatory / standard Django stuff 🙂.
|
||||||
|
|
|
@ -2,11 +2,10 @@ import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import bs4
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
|
|
||||||
from bookmarks.models import Bookmark, parse_tag_string
|
from bookmarks.models import Bookmark, parse_tag_string
|
||||||
|
from bookmarks.services.parser import parse, NetscapeBookmark
|
||||||
from bookmarks.services.tags import get_or_create_tags
|
from bookmarks.services.tags import get_or_create_tags
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -23,52 +22,41 @@ def import_netscape_html(html: str, user: User):
|
||||||
result = ImportResult()
|
result = ImportResult()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
netscape_bookmarks = parse(html)
|
||||||
except:
|
except:
|
||||||
logging.exception('Could not read bookmarks file.')
|
logging.exception('Could not read bookmarks file.')
|
||||||
raise
|
raise
|
||||||
|
|
||||||
bookmark_tags = soup.find_all('dt')
|
for netscape_bookmark in netscape_bookmarks:
|
||||||
|
|
||||||
for bookmark_tag in bookmark_tags:
|
|
||||||
result.total = result.total + 1
|
result.total = result.total + 1
|
||||||
try:
|
try:
|
||||||
_import_bookmark_tag(bookmark_tag, user)
|
_import_bookmark_tag(netscape_bookmark, user)
|
||||||
result.success = result.success + 1
|
result.success = result.success + 1
|
||||||
except:
|
except:
|
||||||
shortened_bookmark_tag_str = str(bookmark_tag)[:100] + '...'
|
shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
|
||||||
logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
|
logging.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
|
||||||
result.failed = result.failed + 1
|
result.failed = result.failed + 1
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _import_bookmark_tag(netscape_bookmark: NetscapeBookmark, user: User):
    """
    Create or update the bookmark of `user` that corresponds to a parsed Netscape bookmark entry.

    An existing bookmark with the same URL and owner is overwritten, otherwise a new one is created.
    The bookmark is saved first and its tags are then replaced with the tags from the entry.

    :param netscape_bookmark: parsed bookmark entry from the import file
    :param user: owner of the imported bookmark and of any tags that are created
    """
    # Local import: the module only imports `datetime` from the datetime package
    from datetime import timezone

    # Either modify existing bookmark for the URL or create new one
    bookmark = _get_or_create_bookmark(netscape_bookmark.href, user)

    bookmark.url = netscape_bookmark.href
    # Build an aware UTC datetime directly from the Unix timestamp. `utcfromtimestamp()` returns a naive
    # value, and calling `astimezone()` on a naive value interprets it as *local* time, which shifts the
    # imported date by the server's UTC offset.
    bookmark.date_added = datetime.fromtimestamp(int(netscape_bookmark.date_added), tz=timezone.utc)
    bookmark.date_modified = bookmark.date_added
    bookmark.unread = False
    bookmark.title = netscape_bookmark.title
    # Only overwrite the description if the import file actually provides one
    if netscape_bookmark.description:
        bookmark.description = netscape_bookmark.description
    bookmark.owner = user

    # The bookmark is saved before its tags are assigned
    bookmark.save()

    # Set tags
    tag_names = parse_tag_string(netscape_bookmark.tag_string)
    tags = get_or_create_tags(tag_names, user)

    bookmark.tags.set(tags)
|
||||||
|
@ -80,27 +68,3 @@ def _get_or_create_bookmark(url: str, user: User):
|
||||||
return Bookmark.objects.get(url=url, owner=user)
|
return Bookmark.objects.get(url=url, owner=user)
|
||||||
except Bookmark.DoesNotExist:
|
except Bookmark.DoesNotExist:
|
||||||
return Bookmark()
|
return Bookmark()
|
||||||
|
|
||||||
|
|
||||||
def _extract_description(bookmark_tag: bs4.Tag):
    """
    Return the description text of a bookmark, or None if it has no <dd> child.

    The Netscape HTML format has no closing tags, so all following bookmark tags end up nested inside the
    description tag. The description is therefore assembled from the text nodes that appear before the
    first <dt> child, which indicates the start of the next bookmark.

    :param bookmark_tag: the <dt> tag of the bookmark
    :return: the stripped description text, or None if there is no direct <dd> child
    """
    dd_tag = bookmark_tag.find('dd', recursive=False)
    if dd_tag is None:
        return None

    text_parts = []
    for node in dd_tag.contents:
        # Exact type checks on purpose: subclasses of these node types are not matched
        node_type = type(node)
        if node_type is bs4.element.Tag and node.name == 'dt':
            # Start of the next bookmark, the description ends here
            break
        if node_type is bs4.element.NavigableString:
            text_parts.append(node)

    return ''.join(text_parts).strip()
|
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pyparsing as pp
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NetscapeBookmark:
    """A single bookmark entry read from a Netscape bookmark HTML file."""
    # Value of the link's HREF attribute
    href: str
    # Text content of the link
    title: str
    # Text following the bookmark's <DD> tag
    description: str
    # Creation time in whole seconds (Unix timestamp)
    date_added: int
    # Raw, unparsed value of the link's TAGS attribute
    tag_string: str
|
||||||
|
|
||||||
|
|
||||||
|
def extract_bookmark_link(tag):
    """
    Parse action that converts a matched bookmark link into a plain dict.

    :param tag: parse results whose first item exposes the link's `href`, `text`, `tags` and
        `add_date` values as attributes
    :return: dict with the keys `href`, `title`, `tag_string` and `date_added`, where `date_added` is
        a Unix timestamp in whole seconds
    """
    link = tag[0]
    raw_add_date = link.add_date

    # A missing or malformed ADD_DATE must not abort the whole import: an exception raised here would
    # propagate out of the parser instead of only skipping this bookmark. Fall back to the current
    # time, the same default that is used when the attribute is absent.
    try:
        date_added = int(raw_add_date) if raw_add_date else None
    except (TypeError, ValueError):
        date_added = None
    if date_added is None:
        date_added = int(datetime.now().timestamp())

    return {
        'href': link.href,
        'title': link.text,
        'tag_string': link.tags,
        'date_added': date_added,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def extract_bookmark(tag):
    """
    Parse action that converts a matched bookmark into a plain dict.

    :param tag: parse results whose first item exposes `link` and `description` as attributes
    :return: dict with the keys `link` and `description`; the description is the first collected
        description, or an empty string if there is none
    """
    entry = tag[0]
    descriptions = entry.description

    if descriptions:
        first_description = descriptions[0]
    else:
        first_description = ''

    return dict(link=entry.link, description=first_description)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_description(tag):
    """
    Parse action that strips surrounding whitespace from a matched description.

    :param tag: parse results whose first item is the raw description text
    :return: the description text without leading and trailing whitespace
    """
    raw_text = tag[0]
    return raw_text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
# define grammar
# Only the opening tags of <DT> and <DD> are used, their closing tags are discarded
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")

# <A ...>text</A>: the opening tag followed by the link body, which is named "text"
_link_body = a_start.tag_body("text")
bookmark_link_tag = pp.Group(a_start + _link_body + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)

# <DD>description: everything after the <DD> tag up to the next opening or closing tag
_next_tag = pp.anyOpenTag | pp.anyCloseTag
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(_next_tag)("description")
bookmark_description_tag.addParseAction(extract_description)

# <DT> followed by a link and any number of descriptions
_descriptions = pp.ZeroOrMore(bookmark_description_tag)("description")
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") + _descriptions)
bookmark_tag.addParseAction(extract_bookmark)
|
||||||
|
|
||||||
|
|
||||||
|
# The return annotation is quoted so that it is not evaluated at runtime; the previous annotation
# `[NetscapeBookmark]` was a list literal rather than a type.
def parse(html: str) -> 'list[NetscapeBookmark]':
    """
    Extract all bookmarks from the contents of a Netscape bookmark HTML file.

    The input is searched for matches of the `bookmark_tag` grammar, so content that does not match
    the grammar is skipped.

    :param html: contents of the bookmark file
    :return: the bookmarks that were found, in the order in which they were matched
    """
    bookmarks = []

    for match in bookmark_tag.searchString(html):
        # Each match wraps the dict produced by the grammar's parse action as its first item
        bookmark_match = match[0]
        link = bookmark_match['link']
        bookmarks.append(NetscapeBookmark(
            href=link['href'],
            title=link['title'],
            description=bookmark_match['description'],
            tag_string=link['tag_string'],
            date_added=link['date_added'],
        ))

    return bookmarks
|
|
@ -35,7 +35,7 @@ def bookmark_import(request):
|
||||||
return HttpResponseRedirect(reverse('bookmarks:settings.index'))
|
return HttpResponseRedirect(reverse('bookmarks:settings.index'))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content = import_file.read()
|
content = import_file.read().decode()
|
||||||
result = import_netscape_html(content, request.user)
|
result = import_netscape_html(content, request.user)
|
||||||
success_msg = str(result.success) + ' bookmarks were successfully imported.'
|
success_msg = str(result.success) + ' bookmarks were successfully imported.'
|
||||||
messages.success(request, success_msg, 'bookmark_import_success')
|
messages.success(request, success_msg, 'bookmark_import_success')
|
||||||
|
|
|
@ -11,6 +11,7 @@ django-sass-processor==0.7.3
|
||||||
django-widget-tweaks==1.4.5
|
django-widget-tweaks==1.4.5
|
||||||
djangorestframework==3.11.1
|
djangorestframework==3.11.1
|
||||||
idna==2.8
|
idna==2.8
|
||||||
|
pyparsing==2.4.7
|
||||||
pytz==2019.1
|
pytz==2019.1
|
||||||
rcssmin==1.0.6
|
rcssmin==1.0.6
|
||||||
requests==2.22.0
|
requests==2.22.0
|
||||||
|
|
|
@ -13,6 +13,7 @@ django-widget-tweaks==1.4.5
|
||||||
djangorestframework==3.11.1
|
djangorestframework==3.11.1
|
||||||
idna==2.8
|
idna==2.8
|
||||||
libsass==0.19.2
|
libsass==0.19.2
|
||||||
|
pyparsing==2.4.7
|
||||||
pytz==2019.1
|
pytz==2019.1
|
||||||
rcssmin==1.0.6
|
rcssmin==1.0.6
|
||||||
requests==2.22.0
|
requests==2.22.0
|
||||||
|
|
Loading…
Reference in New Issue