### README
# This Python script exports all the OneNote notebooks linked to your Microsoft account to HTML files.
#
## Output
# The notebooks will each become a subdirectory of the `output` folder, with further subdirectories
# for the sections within each notebook and the pages within each section. Each page is a directory
# containing the HTML file `main.html` and two directories `images` and `attachments` (if necessary)
# for the images and attachments. Any sub-pages will be subdirectories within this one.
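# For example, a hypothetical notebook "Work" with a section "Projects" containing a page "Roadmap"
# (which in turn has a sub-page "Ideas") would end up roughly as:
#   output/Work/Projects/1_Roadmap/main.html
#   output/Work/Projects/1_Roadmap/images/...
#   output/Work/Projects/1_Roadmap/attachments/...
#   output/Work/Projects/1_Roadmap/2_Ideas/main.html
# (the numeric prefixes come from the page order reported by the API; the exact numbers may differ).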
#
## Setup
# In order to run the script, you must first do the following:
# 1. Go to https://aad.portal.azure.com/ and log in with your Microsoft account.
# 2. Select "Azure Active Directory" and then "App registrations" under "Manage".
# 3. Select "New registration". Choose any name, set "Supported account types" to "Accounts in any
#    organizational directory and personal Microsoft accounts" and under "Redirect URI", select Web
#    and enter `http://localhost:5000/getToken`. Register.
# 4. Copy "Application (client) ID" and paste it as `client_id` below in this script.
# 5. Select "Certificates & secrets" under "Manage". Press "New client secret", choose a name and
#    confirm.
# 6. Copy the client secret and paste it as `secret` below in this script.
# 7. Select "API permissions" under "Manage". Press "Add a permission", scroll down and select OneNote,
#    choose "Delegated permissions" and check "Notes.Read" and "Notes.Read.All". Press "Add
#    permissions".
# 8. Make sure you have Python 3.7 (or newer) installed and install the dependencies using the command
#    `pip install flask msal requests_oauthlib`.
#
## Running
# In a terminal, navigate to the directory where this script is located and run it using
# `python onenote_export.py`. This will start a local web server on port 5000.
# In your browser, navigate to http://localhost:5000 and log in to your Microsoft account.
# The first time you do this, you will also have to accept that the app can read your OneNote notes.
# (This does not give any third parties access to your data, as long as you don't share the client id
# and secret you created on the Azure portal.) After this, go back to the terminal to follow the progress.
#
## Note
# Microsoft limits how many requests you can make within a given time period. Therefore, if you have
# many notes, you might eventually see messages like this in the terminal: "Too many requests, waiting
# 20s and trying again." This is not a problem, but it means the entire process can take a while. Also,
# the login session can expire after a while, which results in a TokenExpiredError. If this happens,
# simply reload http://localhost:5000 and the script will continue (skipping the files it already
# downloaded).

client_id = '...'
secret = '...'

import os
import random
import re
import shutil
import string
import time
import uuid
from html.parser import HTMLParser
from pathlib import Path
from xml.etree import ElementTree

import flask
import msal
from requests_oauthlib import OAuth2Session

output_path = Path('output')

graph_url = 'https://graph.microsoft.com/v1.0'
authority_url = 'https://login.microsoftonline.com/common'
scopes = ['Notes.Read', 'Notes.Read.All']
redirect_uri = 'http://localhost:5000/getToken'

app = flask.Flask(__name__)
app.debug = True
app.secret_key = os.urandom(16)

application = msal.ConfidentialClientApplication(
    client_id,
    authority=authority_url,
    client_credential=secret
)


@app.route("/")
def main():
    resp = flask.Response(status=307)
    resp.headers['location'] = '/login'
    return resp


@app.route("/login")
def login():
    auth_state = str(uuid.uuid4())
    flask.session['state'] = auth_state
    authorization_url = application.get_authorization_request_url(scopes, state=auth_state,
                                                                   redirect_uri=redirect_uri)
    resp = flask.Response(status=307)
    resp.headers['location'] = authorization_url
    return resp
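

# Graph API helpers: get_json() collects a full (possibly paginated) result set by
# following '@odata.nextLink', while get() wraps individual requests, retrying after
# 20s on HTTP 429 (throttling) and skipping pages that return HTTP 500.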
def get_json(graph_client, url, params=None):
    values = []
    next_page = url
    while next_page:
        resp = get(graph_client, next_page, params=params).json()
        if 'value' not in resp:
            raise RuntimeError(f'Invalid server response: {resp}')
        values += resp['value']
        next_page = resp.get('@odata.nextLink')
    return values


def get(graph_client, url, params=None):
    while True:
        resp = graph_client.get(url, params=params)
        if resp.status_code == 429:
            # We are being throttled due to too many requests.
            # See https://docs.microsoft.com/en-us/graph/throttling
            print(' Too many requests, waiting 20s and trying again.')
            time.sleep(20)
        elif resp.status_code == 500:
            # In my case, one specific note page consistently gave this status
            # code when trying to get the content. The error was "19999:
            # Something failed, the API cannot share any more information
            # at the time of the request."
            print(' Error 500, skipping this page.')
            return None
        else:
            resp.raise_for_status()
            return resp
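

# Make a page viewable offline: download every <img> and <object> referenced in the
# page HTML and rewrite the tags to point at local copies under 'images/' and
# 'attachments/' next to main.html.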
def download_attachments(graph_client, content, out_dir):
    image_dir = out_dir / 'images'
    attachment_dir = out_dir / 'attachments'
    # if image_dir.exists():
    #     shutil.rmtree(image_dir)

    class MyHTMLParser(HTMLParser):
        def handle_starttag(self, tag, attrs):
            self.attrs = {k: v for k, v in attrs}

    def generate_html(tag, props):
        element = ElementTree.Element(tag, attrib=props)
        return ElementTree.tostring(element, encoding='unicode')

    def download_image(tag_match):
        # <img width="843" height="218.5" src="..." data-src-type="image/png"
        #  data-fullres-src="..." data-fullres-src-type="image/png" />
        parser = MyHTMLParser()
        parser.feed(tag_match[0])
        props = parser.attrs
        image_url = props.get('data-fullres-src', props['src'])
        image_type = props.get('data-fullres-src-type', props['data-src-type']).split("/")[-1]
        file_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)) + '.' + image_type
        img = get(graph_client, image_url).content
        print(f' Downloaded image of {len(img)} bytes.')
        image_dir.mkdir(exist_ok=True)
        with open(image_dir / file_name, "wb") as f:
            f.write(img)
        props['src'] = "images/" + file_name
        props = {k: v for k, v in props.items() if 'data-fullres-src' not in k}
        return generate_html('img', props)

    def download_attachment(tag_match):
        # <object data-attachment="Trig_Cheat_Sheet.pdf" type="application/pdf" data="..."
        #  style="position:absolute;left:528px;top:139px" />
        parser = MyHTMLParser()
        parser.feed(tag_match[0])
        props = parser.attrs
        data_url = props['data']
        file_name = props['data-attachment']
        if (attachment_dir / file_name).exists():
            print(f' Attachment {file_name} already downloaded; skipping.')
        else:
            data = get(graph_client, data_url).content
            print(f' Downloaded attachment {file_name} of {len(data)} bytes.')
            attachment_dir.mkdir(exist_ok=True)
            with open(attachment_dir / file_name, "wb") as f:
                f.write(data)
        props['data'] = "attachments/" + file_name
        return generate_html('object', props)

    content = re.sub(r"<img .*?\/>", download_image, content, flags=re.DOTALL)
    content = re.sub(r"<object .*?\/>", download_attachment, content, flags=re.DOTALL)
    return content
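

# OAuth redirect target: exchange the authorization code for a token, then walk all
# notebooks -> sections -> pages and write each page (plus its images and attachments)
# under the output directory.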
@app.route("/getToken")
def main_logic():
    code = flask.request.args['code']
    token = application.acquire_token_by_authorization_code(code, scopes=scopes,
                                                            redirect_uri=redirect_uri)
    graph_client = OAuth2Session(token=token)

    notebooks = get_json(graph_client, f'{graph_url}/me/onenote/notebooks')
    print(f'Got {len(notebooks)} notebooks.')
    for nb in notebooks:
        nb_name = nb["displayName"]
        print(f'Opening notebook {nb_name}')
        sections = get_json(graph_client, nb['sectionsUrl'])
        print(f'  Got {len(sections)} sections.')
        for sec in sections:
            sec_name = sec["displayName"]
            print(f'  Opening section {sec_name}')
            pages = get_json(graph_client, sec['pagesUrl'] + '?pagelevel=true')
            print(f'    Got {len(pages)} pages.')
            # Sort by page order; use an explicit key so that equal orders never
            # fall back to comparing the page dicts themselves.
            pages = sorted([(page['order'], page) for page in pages], key=lambda x: x[0])
            level_dirs = [None] * 4
            for order, page in pages:
                level = page['level']
                page_title = f'{order}_{page["title"]}'
                print(f'    Opening page {page_title}')
                if level == 0:
                    out_dir = output_path / nb_name / sec_name / page_title
                else:
                    out_dir = level_dirs[level - 1] / page_title
                level_dirs[level] = out_dir
                out_html = out_dir / 'main.html'
                if out_html.exists():
                    print('      HTML file already exists; skipping this page')
                    continue
                out_dir.mkdir(parents=True, exist_ok=True)
                response = get(graph_client, page['contentUrl'])
                if response is not None:
                    content = response.text
                    print(f'      Got content of length {len(content)}')
                    content = download_attachments(graph_client, content, out_dir)
                    with open(out_html, "w") as f:
                        f.write(content)
    print("Done!")
    return flask.render_template_string(
        '<html><head><title>Done</title></head><body><p><b>Done</b></p></body></html>')


if __name__ == "__main__":
    app.run()
@minghao51 Thanks! Sure, here you go: https://github.com/Danmou/onenote_export
I added filename sanitizing, but PRs with further improvements are welcome!
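(The sanitizing is roughly along the lines of the sketch below; it's just an illustration with a made-up helper name, and the exact rules in the repo may differ.)

```python
import re

def sanitize_filename(name: str) -> str:
    # Illustrative only: replace characters that are not allowed in Windows/macOS
    # file names, and strip trailing dots/spaces, which Windows also rejects.
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    return name.rstrip('. ') or '_'
```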
Good job! Thanks for sharing your idea!
This is awesome and works like a charm. I had a lot of recursive section groups, so I added handling for them to the export and also had to fix one error with broken images. If it's of interest, see my fork: https://gist.github.com/sspaeti/8daab59a80adc664fa8cbcba707ea21d/revisions
The changes are mainly in recursion_section_group(). I also reformatted the code, which is why the diff looks larger than it is.
@sspaeti I'm not sure what you mean by recursive section groups, but if you think your change is generally useful and backwards compatible, feel free to submit a PR to the repo I linked above
I meant that section groups were not handled in your example, let alone a section group inside another section group (these can be nested quite deeply, which is what I have ;-)). That's what I added to your script in the fork above, in case someone else also has lots of these.
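Roughly, the idea is something like the sketch below. It's simplified (my fork structures it differently and keeps the group names in the output path); `sectionsUrl` and `sectionGroupsUrl` are standard properties of the Graph OneNote notebook and sectionGroup resources, `get_json()` is the helper already defined in the script, and `get_all_sections` is just a made-up name for illustration:

```python
def get_all_sections(graph_client, sections_url, section_groups_url):
    # Sections directly at this level (notebook or section group)...
    sections = get_json(graph_client, sections_url)
    # ...plus everything inside nested section groups, recursively.
    for group in get_json(graph_client, section_groups_url):
        sections += get_all_sections(graph_client, group['sectionsUrl'], group['sectionGroupsUrl'])
    return sections
```

In the main loop you would then call it once per notebook with `nb['sectionsUrl']` and `nb['sectionGroupsUrl']` instead of the plain `get_json(graph_client, nb['sectionsUrl'])`.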
This is awesome, now I can start my own free blogging channel. Thanks a lot.
This is awesome! Would you be keen to turn this into a repository so people can contribute?