Last active
May 26, 2019 15:29
-
-
Save pushpendrapratap/ed2dd1714bad002b05a25e90042ea93a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| # copied from https://github.com/bookmarks-tools/bookmarks-parser/blob/master/bookmarks_parser/bookmarks_parser.py | |
def get_node_data(node):
    """Collect the bookmark or folder attributes found under one ``<dt>`` node.

    The direct children of *node* are inspected in order:

    * an ``<a>`` child marks the entry as a bookmark and supplies its URL,
      title, dates, icon and (optionally) icon URI and tags;
    * an ``<h3>`` child marks the entry as a folder and supplies its title,
      dates and ``ns_root`` marker;
    * a ``<dl>`` child is kept under the temporary ``'__dir_dl'`` key so the
      caller can process the folder's contents.

    For a folder without a nested ``<dl>``, the first ``<dl>`` inside a
    following ``<dd>`` sibling is used instead.

    :param node: element whose children are iterated; children must expose
        ``name``, ``text`` and ``get()``.
    :return: dict describing the entry; empty when *node* has no ``<a>``,
        ``<h3>`` or ``<dl>`` child.
    """
    data = {}
    for child in node:
        if child.name == 'a':
            data['type'] = 'bookmark'
            data['url'] = child.get('href')
            data['title'] = child.text
            data['add_date'] = child.get('add_date')
            data['icon'] = child.get('icon')
            # only in FF
            icon_uri = child.get('icon_uri')
            if icon_uri:
                data['icon_uri'] = icon_uri
            tags = child.get('tags')
            if tags:
                data['tags'] = tags.split(',')
        elif child.name == 'h3':
            data['type'] = 'folder'
            data['title'] = child.text
            data['add_date'] = child.get('add_date')
            data['last_modified'] = child.get('last_modified')
            data['ns_root'] = None
            # for Bookmarks Toolbar in FF and Bookmarks bar in Chrome
            if child.get('personal_toolbar_folder'):
                data['ns_root'] = 'toolbar'
            # FF Other Bookmarks
            if child.get('unfiled_bookmarks_folder'):
                data['ns_root'] = 'other_bookmarks'
        elif child.name == 'dl':
            # store DL element reference for further processing the child nodes
            data['__dir_dl'] = child

    # Use .get(): a <dt> with neither <a> nor <h3> never sets 'type', and
    # indexing it directly would raise KeyError.
    if data.get('type') == 'folder' and not data.get('__dir_dl'):
        # The folder's <dl> was not nested in the <dt>; look for it inside
        # the <dd> element that directly follows.
        if node.next_sibling and node.next_sibling.name == "dd":
            dls = node.next_sibling.find_all('dl')
            if dls:
                data['__dir_dl'] = dls[0]
    return data
def process_dir(bookmark_dir, level):
    """Convert the ``<dt>`` entries of a bookmark list into a list of dicts.

    Every ``<dt>`` child of *bookmark_dir* is converted with
    ``get_node_data``; folders are processed recursively and their contents
    stored under ``'children'``.  At the top level (``level == 0``) entries
    without an ``ns_root`` marker are collected into a synthetic "menu" folder
    that is appended after all other items.

    :param bookmark_dir: iterable of child elements (typically a ``<dl>``).
    :param level: nesting depth; ``0`` for the root list.
    :return: list of bookmark/folder dicts.
    """
    items = []
    menu_root = None
    for child in bookmark_dir:
        if child.name != 'dt':
            continue
        item_data = get_node_data(child)

        # Recurse into the folder's own list, then drop the temporary key.
        if item_data.get('__dir_dl'):
            item_data['children'] = process_dir(item_data['__dir_dl'], level + 1)
            del item_data['__dir_dl']

        if level == 0 and not item_data.get('ns_root'):
            if menu_root is None:
                # previous_sibling is None when the <dt> is the first child;
                # guard it instead of raising AttributeError and fall back to
                # the "Bookmarks Menu" title.
                previous = child.previous_sibling
                if previous is not None and previous.name == "dt":
                    # For chrome
                    title = "Other bookmarks"
                else:
                    # for FF
                    title = "Bookmarks Menu"
                menu_root = {'title': title, 'children': [], 'ns_root': 'menu'}
            menu_root['children'].append(item_data)
        else:
            items.append(item_data)

    if menu_root:
        items.append(menu_root)
    return items
def parse(file_path):
    """Parse a Netscape-format bookmark export file.

    The file is read in binary mode and parsed with BeautifulSoup's
    ``html5lib`` parser; the first ``<dl>`` element in the document is treated
    as the root bookmark list and handed to ``process_dir``.

    :param file_path: path to the exported bookmarks HTML file.
    :return: list of bookmark/folder dicts; an empty list when the document
        contains no ``<dl>`` element.
    """
    with open(file_path, 'rb') as f:
        soup = BeautifulSoup(f, "html5lib")
    root_dl = soup.find('dl')
    if root_dl is None:
        # No bookmark list in the document: nothing to return (previously
        # this raised IndexError on dls[0]).
        return []
    return process_dir(root_dl, 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Parsing Netscape bookmark files (as exported by Google Chrome, Firefox, etc.) using bs4