Skip to content

Instantly share code, notes, and snippets.

@pushpendrapratap
Last active May 26, 2019 15:29
Show Gist options
  • Save pushpendrapratap/ed2dd1714bad002b05a25e90042ea93a to your computer and use it in GitHub Desktop.
Save pushpendrapratap/ed2dd1714bad002b05a25e90042ea93a to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
# copied from https://github.com/bookmarks-tools/bookmarks-parser/blob/master/bookmarks_parser/bookmarks_parser.py
def get_node_data(node):
data = {}
for child in node:
if child.name == 'a':
data['type'] = 'bookmark'
data['url'] = child.get('href')
data['title'] = child.text
data['add_date'] = child.get('add_date')
data['icon'] = child.get('icon')
# only in FF
icon_uri = child.get('icon_uri')
if icon_uri:
data['icon_uri'] = icon_uri
tags = child.get('tags')
if tags:
data['tags'] = tags.split(',')
elif child.name == 'h3':
data['type'] = 'folder'
data['title'] = child.text
data['add_date'] = child.get('add_date')
data['last_modified'] = child.get('last_modified')
data['ns_root'] = None
# for Bookmarks Toolbar in FF and Bookmarks bar in Chrome
if child.get('personal_toolbar_folder'):
data['ns_root'] = 'toolbar'
# FF Other Bookmarks
if child.get('unfiled_bookmarks_folder'):
data['ns_root'] = 'other_bookmarks'
elif child.name == 'dl':
# store DL element reference for further processing the child nodes
data['__dir_dl'] = child
if data['type'] == 'folder' and not data.get('__dir_dl'):
if node.next_sibling and node.next_sibling.name == "dd":
dls = node.next_sibling.find_all('dl')
if dls:
data['__dir_dl'] = dls[0]
return data
def process_dir(bookmark_dir, level):
items = []
menu_root = None
for child in bookmark_dir:
if child.name != 'dt':
continue
item_data = get_node_data(child)
if level == 0 and (not item_data.get('ns_root')):
if menu_root is None:
# For chrome
if child.previous_sibling.name == "dt":
menu_root = {'title': "Other bookmarks", 'children': [], 'ns_root': 'menu'}
# for FF
else:
menu_root = {'title': "Bookmarks Menu", 'children': [], 'ns_root': 'menu'}
if item_data.get('__dir_dl'):
item_data['children'] = process_dir(item_data['__dir_dl'], level + 1)
del item_data['__dir_dl']
menu_root['children'].append(item_data)
else:
if item_data.get('__dir_dl'):
item_data['children'] = process_dir(item_data['__dir_dl'], level + 1)
del item_data['__dir_dl']
items.append(item_data)
if menu_root:
items.append(menu_root)
return items
def parse(file_path):
with open(file_path, 'rb') as f:
soup = BeautifulSoup(f, "html5lib")
dls = soup.find_all('dl')
bookmarks = process_dir(dls[0], 0)
return bookmarks
@pushpendrapratap
Copy link
Author

Parsing Netscape bookmark (Google Chrome, Firefox, ... export files) using bs4

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment