Last active: August 28, 2025 01:53
Revisions
willccbb revised this gist (Mar 17, 2025). 1 changed file with 24 additions and 23 deletions: get_arxiv_pdf() was moved from the very top of the file to just below the imports, so the # /// script metadata block now opens the file; the function body itself is unchanged.
willccbb created this gist (Mar 17, 2025) with the original version of the script:
def get_arxiv_pdf(arxiv_id):
    """
    Download the PDF for an arXiv ID

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        bytes: PDF content
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(pdf_url, headers=headers)
    if response.status_code != 200:
        raise ValueError(f"Could not download PDF. Status code: {response.status_code}")

    return response.content

# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "click",
#     "mistralai",
#     "markdown",
#     "requests",
#     "beautifulsoup4",
# ]
# ///

import os
import json
import base64
import re
import requests
import tempfile
from pathlib import Path
import unicodedata
from urllib.parse import urlparse

import click
import markdown
from bs4 import BeautifulSoup
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk


def extract_arxiv_id(url):
    """
    Extract arXiv ID from an arXiv URL (abs, PDF, or HTML)

    Args:
        url (str): arXiv URL

    Returns:
        str: arXiv ID
    """
    # Parse the URL
    parsed_url = urlparse(url)

    # Check if it's an arXiv URL
    if 'arxiv.org' not in parsed_url.netloc:
        raise ValueError("Not an arXiv URL")

    # Extract the arXiv ID
    path_parts = parsed_url.path.strip('/').split('/')

    # Handle different URL formats
    arxiv_id = None
    if 'abs' in path_parts:
        # Format: arxiv.org/abs/1234.56789
        idx = path_parts.index('abs')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1]
    elif 'pdf' in path_parts:
        # Format: arxiv.org/pdf/1234.56789.pdf
        idx = path_parts.index('pdf')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1].replace('.pdf', '')
    elif 'html' in path_parts:
        # Format: arxiv.org/html/1234.56789
        idx = path_parts.index('html')
        if idx + 1 < len(path_parts):
            arxiv_id = path_parts[idx + 1]
    else:
        # Try to find the ID in the last part of the path
        last_part = path_parts[-1]
        if re.match(r'\d+\.\d+', last_part):
            arxiv_id = last_part

    if not arxiv_id:
        raise ValueError("Could not extract arXiv ID from URL")

    return arxiv_id


def get_arxiv_bibtex(arxiv_id):
    """
    Download the BibTeX citation for an arXiv ID

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        str: BibTeX citation text or None if not available
    """
    # arXiv's BibTeX endpoint
    bibtex_url = f"https://arxiv.org/bibtex/{arxiv_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(bibtex_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to get BibTeX: HTTP {response.status_code}")
            return None

        # Extract BibTeX content from the response
        soup = BeautifulSoup(response.text, 'html.parser')

        # First try to find the textarea that usually contains the BibTeX
        textarea = soup.find('textarea')
        if textarea:
            return textarea.get_text().strip()

        # If no textarea, try to extract pre-formatted text
        pre = soup.find('pre')
        if pre:
            return pre.get_text().strip()

        # Last resort: generate a basic BibTeX entry ourselves
        print("Could not find BibTeX on the page, generating a basic entry")

        # We'll need the title and authors
        abs_url = f"https://arxiv.org/abs/{arxiv_id}"
        abs_response = requests.get(abs_url, headers=headers)
        if abs_response.status_code == 200:
            abs_soup = BeautifulSoup(abs_response.text, 'html.parser')

            # Extract title
            title_elem = abs_soup.find('h1', class_='title')
            title = title_elem.get_text().replace('Title:', '').strip() if title_elem else "Unknown Title"

            # Extract authors
            authors_elem = abs_soup.find('div', class_='authors')
            authors = authors_elem.get_text().replace('Authors:', '').strip() if authors_elem else "Unknown Authors"

            # Extract year
            year = "2023"  # Default to current year if we can't find it
            date_elem = abs_soup.find('div', class_='dateline')
            if date_elem:
                date_match = re.search(r'\b(19|20)\d{2}\b', date_elem.get_text())
                if date_match:
                    year = date_match.group(0)

            # Generate a simple BibTeX entry
            bibtex = f"""@article{{{arxiv_id},
  title = {{{title}}},
  author = {{{authors}}},
  journal = {{arXiv preprint arXiv:{arxiv_id}}},
  year = {{{year}}},
  url = {{https://arxiv.org/abs/{arxiv_id}}},
}}"""
            return bibtex

    except Exception as e:
        print(f"Error fetching BibTeX: {str(e)}")
        return None

    return None


def get_paper_metadata(arxiv_id):
    """
    Get the paper title and submission date from arXiv abstract page

    Args:
        arxiv_id (str): arXiv ID

    Returns:
        tuple: (Paper title, Submission date)
    """
    abs_url = f"https://arxiv.org/abs/{arxiv_id}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(abs_url, headers=headers)
    if response.status_code != 200:
        return "unknown-paper", None

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title
    title = "unknown-paper"
    title_element = soup.find('h1', class_='title')
    if title_element:
        title = title_element.get_text().replace('Title:', '').strip()

    # Extract submission date
    submission_date = None
    submission_element = soup.find('div', class_='submission-history')
    if submission_element:
        # Look for the first submission date
        submission_text = submission_element.get_text()
        match = re.search(r'\[v1\]\s+(.+?)\s+\(', submission_text)
        if not match:
            # Try alternative pattern without version
            match = re.search(r'Submitted\s+(.+?)\s+\(', submission_text)
        if match:
            submission_date = match.group(1).strip()

    return title, submission_date


def sanitize_filename(title):
    """
    Convert title to a clean filename format

    Args:
        title (str): Paper title

    Returns:
        str: Sanitized filename
    """
    # Convert to lowercase and replace spaces with hyphens
    filename = title.lower()
    # Remove accents
    filename = ''.join(c for c in unicodedata.normalize('NFKD', filename)
                       if not unicodedata.combining(c))
    # Replace non-alphanumeric characters with hyphens
    filename = re.sub(r'[^a-z0-9]', '-', filename)
    # Replace multiple hyphens with a single hyphen
    filename = re.sub(r'-+', '-', filename)
    # Remove leading and trailing hyphens
    filename = filename.strip('-')

    return filename


@click.command()
@click.argument("arxiv_url")
@click.option(
    "--api-key",
    help="Mistral API key. If not provided, will use MISTRAL_API_KEY environment variable.",
    envvar="MISTRAL_API_KEY",
)
@click.option(
    "--model", help="Mistral OCR model to use.", default="mistral-ocr-latest"
)
@click.option(
    "--json/--no-json",
    "-j/-J",
    "json_output",
    is_flag=True,
    default=False,
    help="Return raw JSON instead of markdown text/Return markdown text (default).",
)
@click.option(
    "--html/--no-html",
    "-h/-H",
    is_flag=True,
    default=False,
    help="Convert markdown to HTML/Keep as markdown (default).",
)
@click.option(
    "--inline-images/--no-inline-images",
    "-i/-I",
    is_flag=True,
    default=False,
    help="Include images inline as data URIs/Don't include inline images (default).",
)
@click.option(
    "--extract-images/--no-extract-images",
    "-e/-E",
    is_flag=True,
    default=True,  # Extract images by default
    help="Extract images as separate files (default)/Skip extracting images.",
)
@click.option(
    "--silent/--verbose",
    "-s/-v",
    is_flag=True,
    default=False,
    help="Suppress all output except for the requested data/Show detailed progress (default).",
)
@click.option(
    "--pages",
    type=int,
    default=20,
    help="Limit processing to the first N pages (default: 20).",
)
def arxiv_to_markdown(
    arxiv_url,
    api_key,
    model,
    json_output,
    html,
    inline_images,
    extract_images,
    silent,
    pages,
):
    """Process an arXiv paper (given its URL) and convert it to markdown using Mistral OCR.

    ARXIV_URL is the URL of the arXiv paper (abs, PDF, or HTML format).

    The script will download the PDF version, process it with OCR, and save the result
    in the papers/ directory with a sanitized filename based on the paper title.

    \b
    Examples:
        python arxiv_ocr.py https://arxiv.org/abs/1706.03762 --api-key YOUR_API_KEY
        python arxiv_ocr.py https://arxiv.org/abs/1706.03762 --pages 5 --html
    """
    # Validate API key
    if not api_key:
        raise click.ClickException("No API key provided and MISTRAL_API_KEY environment variable not set.")

    try:
        # Check if papers directory exists, create if not
        papers_dir = Path("papers")
        papers_dir.mkdir(exist_ok=True)

        # Extract arXiv ID from URL
        if not silent:
            click.echo(f"Extracting arXiv ID from URL: {arxiv_url}", err=True)
        arxiv_id = extract_arxiv_id(arxiv_url)
        if not silent:
            click.echo(f"Found arXiv ID: {arxiv_id}", err=True)

        # Get paper metadata
        if not silent:
            click.echo("Fetching paper metadata...", err=True)
        paper_title, submission_date = get_paper_metadata(arxiv_id)
        sanitized_title = sanitize_filename(paper_title)
        if not silent:
            click.echo(f"Paper title: {paper_title}", err=True)
            if submission_date:
                click.echo(f"Submission date: {submission_date}", err=True)
            click.echo(f"Sanitized filename: {sanitized_title}", err=True)

        # Download PDF
        if not silent:
            click.echo(f"Downloading PDF from arXiv...", err=True)
        pdf_content = get_arxiv_pdf(arxiv_id)

        # Download BibTeX citation
        if not silent:
            click.echo(f"Downloading BibTeX citation...", err=True)
        bibtex_content = get_arxiv_bibtex(arxiv_id)
        if bibtex_content:
            if not silent:
                click.echo(f"BibTeX citation retrieved successfully", err=True)
        else:
            if not silent:
                click.echo(f"Could not retrieve BibTeX citation", err=True)

        # Create temp file for PDF
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf_path = Path(temp_pdf.name)
            temp_pdf.write(pdf_content)

        try:
            if not silent:
                click.echo(f"Downloaded PDF to {temp_pdf_path}", err=True)

            # Process PDF with Mistral OCR
            client = Mistral(api_key=api_key)

            uploaded_file = None
            try:
                # Upload PDF to Mistral
                if not silent:
                    click.echo(f"Uploading file to Mistral...", err=True)
                uploaded_file = client.files.upload(
                    file={
                        "file_name": f"{arxiv_id}.pdf",
                        "content": pdf_content,
                    },
                    purpose="ocr",
                )

                # Get signed URL
                signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

                # Process with OCR
                if not silent:
                    click.echo(f"Processing with OCR model: {model}...", err=True)

                # Prepare OCR processing parameters
                ocr_params = {
                    "document": DocumentURLChunk(document_url=signed_url.url),
                    "model": model,
                    "include_image_base64": True,  # Always request images
                }

                # Add pages parameter if limited pages are requested
                if pages > 0:
                    ocr_params["pages"] = list(range(pages))
                    if not silent:
                        click.echo(f"Limiting processing to first {pages} pages", err=True)

                pdf_response = client.ocr.process(**ocr_params)

                # Parse response
                response_dict = json.loads(pdf_response.model_dump_json())

                # Define output paths - always create a directory structure
                output_dir = papers_dir / sanitized_title
                output_dir.mkdir(exist_ok=True)
                output_file = output_dir / "README.md"
                bibtex_file = output_dir / f"{sanitized_title}.bib"

                # For HTML output, use index.html instead of README.md
                if html:
                    output_file = output_dir / "index.html"

                # Save BibTeX citation if available
                if bibtex_content:
                    try:
                        bibtex_file.write_text(bibtex_content)
                        if not silent:
                            click.echo(f"BibTeX citation saved to {bibtex_file}", err=True)
                    except Exception as e:
                        if not silent:
                            click.echo(f"Error saving BibTeX file: {str(e)}", err=True)

                # Process images if needed
                image_map = {}
                if extract_images or inline_images:
                    image_count = 0

                    # For extract_images, we need a directory
                    if extract_images:
                        image_dir = output_dir

                    # Look for images in the OCR response
                    for page in response_dict.get("pages", []):
                        for img in page.get("images", []):
                            if "id" in img and "image_base64" in img:
                                image_id = img["id"]
                                image_data = img["image_base64"]

                                # Sometimes the base64 data has a data URI prefix, sometimes not
                                if image_data.startswith("data:image/"):
                                    # Extract the mime type and base64 data
                                    mime_type = image_data.split(";")[0].split(":")[1]
                                    base64_data = image_data.split(",", 1)[1]
                                else:
                                    # Determine mime type from file extension or default to jpeg
                                    ext = image_id.split(".")[-1].lower() if "." in image_id else "jpeg"
                                    mime_type = f"image/{ext}"
                                    base64_data = image_data

                                # For extracted images, save to disk
                                if extract_images:
                                    # Create a suitable filename if it doesn't have an extension
                                    if "." not in image_id:
                                        ext = mime_type.split("/")[1]
                                        image_filename = f"{image_id}.{ext}"
                                    else:
                                        image_filename = image_id

                                    image_path = image_dir / image_filename
                                    try:
                                        with open(image_path, "wb") as img_file:
                                            img_file.write(base64.b64decode(base64_data))
                                        # Map image_id to relative path for referencing
                                        image_map[image_id] = image_filename
                                        image_count += 1
                                    except Exception as e:
                                        if not silent:
                                            click.echo(f"Warning: Failed to save image {image_id}: {str(e)}", err=True)

                                # For inline images, prepare data URIs
                                elif inline_images:
                                    # Ensure it has the data URI prefix
                                    if not image_data.startswith("data:"):
                                        image_data = f"data:{mime_type};base64,{base64_data}"
                                    image_map[image_id] = image_data

                    if not silent and extract_images and image_count > 0:
                        click.echo(f"Extracted {image_count} images to {image_dir}", err=True)

                # Generate output content
                if json_output:
                    result = json.dumps(response_dict, indent=4)
                else:
                    # Concatenate markdown content from all pages
                    markdown_contents = [
                        page.get("markdown", "")
                        for page in response_dict.get("pages", [])
                    ]

                    # Add metadata at the top
                    markdown_text = f"# {paper_title}\n\n"

                    # Add source information near the top
                    markdown_text += f"*Source: [arXiv:{arxiv_id}](https://arxiv.org/abs/{arxiv_id})*\n\n"

                    # Add submission date if available
                    if submission_date:
                        markdown_text += f"*[Submitted on {submission_date}]*\n\n"

                    # Add the content
                    content_text = "\n\n".join(markdown_contents)
                    markdown_text += content_text

                    # Add link to BibTeX if available at the bottom
                    if bibtex_content:
                        markdown_text += f"\n\n---\n*[BibTeX citation]({sanitized_title}.bib)*\n"

                    # Post-processing: Remove duplicate title if present
                    lines = markdown_text.split('\n')
                    if len(lines) >= 3:
                        if lines[0].strip().startswith('#') and lines[2].strip().startswith('#') and lines[0].strip() == lines[2].strip():
                            # Remove duplicate title
                            lines.pop(2)
                            markdown_text = '\n'.join(lines)

                    # Handle image references: point them at the extracted file or data URI
                    for img_id, img_src in image_map.items():
                        pattern = r"!\[(.*?)\]\(\s*" + re.escape(img_id) + r"\s*\)"
                        replacement = r"![\1](" + img_src + ")"
                        markdown_text = re.sub(pattern, replacement, markdown_text)

                    if html:
                        # Convert markdown to HTML
                        md = markdown.Markdown(extensions=["tables"])
                        html_content = md.convert(markdown_text)

                        # Add HTML wrapper with basic styling
                        result = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{paper_title}</title>
    <style>
        body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 0 auto; max-width: 800px; padding: 20px; }}
        img {{ max-width: 100%; height: auto; }}
        h1, h2, h3 {{ margin-top: 1.5em; }}
        p {{ margin: 1em 0; }}
    </style>
</head>
<body>
{html_content}
</body>
</html>"""
                    else:
                        # markdown
                        result = markdown_text

                # Write output to file
                output_file.write_text(result)

                if not silent:
                    click.echo(f"Results saved to {output_file}", err=True)
                    if bibtex_content:
                        click.echo(f"BibTeX citation saved to {bibtex_file}", err=True)
                    click.echo(f"Original arXiv URL: {arxiv_url}", err=True)
                    click.echo(f"PDF URL: https://arxiv.org/pdf/{arxiv_id}.pdf", err=True)

            finally:
                # Clean up uploaded file
                if uploaded_file:
                    try:
                        client.files.delete(file_id=uploaded_file.id)
                        if not silent:
                            click.echo("Temporary Mistral file deleted", err=True)
                    except Exception as e:
                        if not silent:
                            click.echo(f"Warning: Could not delete temporary Mistral file: {str(e)}", err=True)

        finally:
            # Clean up temp file
            if temp_pdf_path.exists():
                os.unlink(temp_pdf_path)
                if not silent:
                    click.echo("Temporary PDF file deleted", err=True)

    except Exception as e:
        raise click.ClickException(f"Error: {str(e)}")


if __name__ == "__main__":
    arxiv_to_markdown()
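Because the script carries PEP 723 inline metadata (the # /// script block), a runner that understands it, such as uv, can resolve click, mistralai, markdown, requests, and beautifulsoup4 on the fly. A minimal usage sketch, assuming the file is saved as arxiv_ocr.py (the name used in the docstring examples) and that MISTRAL_API_KEY is exported in the environment:

    export MISTRAL_API_KEY=...
    uv run arxiv_ocr.py https://arxiv.org/abs/1706.03762 --pages 5 --html

Output is written to papers/<sanitized-title>/ as README.md (or index.html with --html), alongside any extracted images and a <sanitized-title>.bib file when a BibTeX entry could be fetched.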