"""
kindle_highlights.py

This script provides functionality to parse and process Kindle highlights and notes.
It extracts data from a Kindle clippings file and organizes it into a structured format.

Usage:
    from kindle_highlights import KindleClippingCollection

    collection = KindleClippingCollection.extract_from_file("path/to/My Clippings.txt")
    collection.save_as_files("output_directory")
"""

import logging
import re
from datetime import datetime
from os import makedirs, path
from typing import Dict, List, Literal, Optional

from pydantic import BaseModel, ValidationError, field_validator

# Line that terminates every entry in the clippings file.
_CLIPPING_SEPARATOR = "\n==========\n"

# One clipping entry: "<title> (<author>)", a metadata line, then the content.
_CLIPPING_PATTERN = re.compile(
    r"^(?P<title>.*) \((?P<author>.*)\)\n"
    r"- Your (?P<type>Highlight|Note|Bookmark)"
    r"(?: on page (?P<page>[\w\d]+) \|)?"
    r"(?: at)? location (?P<location_start>\d+)(?:-(?P<location_end>\d+))? "
    r"\| Added on (?P<date>.*)\n+"
    r"(?P<content>(?:.|\n)*)",
    re.MULTILINE,
)

# Date layout of the "Added on" field, e.g. "Monday, 5 March 2018 21:15:02".
_DATE_FORMAT = "%A, %d %B %Y %H:%M:%S"

# Characters removed from a book title before it is used as a file name.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


class KindleClipping(BaseModel):
    """A single highlight, note or bookmark parsed from a clippings file."""

    title: str
    author: str
    type: Literal["Highlight", "Note", "Bookmark"]
    page: Optional[int | str]
    location_start: int
    location_end: Optional[int]
    date: datetime
    content: str

    @field_validator("date", mode="before")
    @classmethod
    def parse_date(cls, value):
        """Convert the raw "Added on" text into a ``datetime``.

        Values that already are ``datetime`` objects are returned unchanged so
        the model can also be built programmatically. A string that does not
        follow the expected layout makes ``strptime`` raise ``ValueError``,
        which pydantic reports as a validation error.
        """
        if isinstance(value, datetime):
            return value
        return datetime.strptime(value, _DATE_FORMAT)

    def to_location_range(self) -> str:
        """Return ``"start-end"`` when an end location is set, else ``"start"``."""
        if self.location_end:
            return f"{self.location_start}-{self.location_end}"
        return str(self.location_start)

    def to_position(self) -> str:
        """Return a readable position, prefixed with the page when one is set."""
        if self.page:
            return f"page {self.page}, loc. {self.to_location_range()}"
        return f"loc. {self.to_location_range()}"

    def to_markdown(self) -> str:
        """Render the clipping as a Markdown list item.

        Raises:
            ValueError: If ``type`` is not one of the three known kinds.
        """
        if self.type == "Highlight":
            return f"* {self.content}\n - {self.to_position()}"
        if self.type == "Note":
            return f" - **COMMENT**: {self.content}"
        if self.type == "Bookmark":
            return f"* **BOOKMARK**: {self.content} {self.to_position()}"
        raise ValueError(f"Unknown type: {self.type}")


class KindleClippingCollection:
    """Clippings grouped by book title, with Markdown export helpers."""

    def __init__(self, books: Dict[str, List[KindleClipping]]):
        """Store ``books``, a mapping of book title to that book's clippings."""
        self.books = books

    @classmethod
    def group_by_book_title(
        cls, clippings: List[KindleClipping]
    ) -> "KindleClippingCollection":
        """Build a collection by grouping ``clippings`` on their title.

        The order of clippings inside each book follows the input order.
        """
        books: Dict[str, List[KindleClipping]] = {}
        for clipping in clippings:
            books.setdefault(clipping.title, []).append(clipping)
        return cls(books)

    @classmethod
    def extract_from_file(
        cls, file_path: str, clear_highlights: bool = True
    ) -> "KindleClippingCollection":
        """Parse a clippings file into a collection.

        Args:
            file_path: Path of the clippings text file (read as UTF-8, a
                leading BOM is ignored).
            clear_highlights: When true, run ``clear_highlights_all`` on the
                result before returning it.

        Returns:
            The parsed collection. Entries that do not match the expected
            layout or fail validation are logged as errors and skipped.
        """
        with open(file_path, "r", encoding="utf-8-sig") as file:
            text = file.read()

        clippings: List[KindleClipping] = []
        for raw_part in text.split(_CLIPPING_SEPARATOR):
            # NOTE(review): some devices reportedly write a BOM in front of
            # every entry, not only the first one; a stray BOM would become
            # part of the title and split one book into two groups.
            part = raw_part.lstrip("\ufeff")
            if not part.strip():
                # The file ends with a separator, so the final part is empty;
                # that is not a parse failure and should not be logged as one.
                continue
            match = _CLIPPING_PATTERN.match(part)
            if match is None:
                logging.error(f"Error parsing clipping (no match):\n{part}")
                continue
            try:
                clippings.append(KindleClipping(**match.groupdict()))
            except ValidationError as e:
                logging.error(f"Error parsing clipping ({e}):\n{part}")

        res = cls.group_by_book_title(clippings)
        if clear_highlights:
            res = res.clear_highlights_all()
        return res

    @staticmethod
    def clear_highlights(clippings: List[KindleClipping]) -> List[KindleClipping]:
        """Drop highlights that were superseded by the entry right after them.

        A highlight directly followed by another highlight with the same start
        location is treated as an older version and replaced by the newer one.
        Notes and bookmarks are never replaced.

        Args:
            clippings: Clippings of one book, in file order.

        Returns:
            A new list; an empty input yields an empty list.
        """
        if not clippings:
            return []

        last = clippings[0]
        filtered = [last]
        for clipping in clippings[1:]:
            is_updated_highlight = (
                clipping.type == "Highlight"
                # Only a highlight can be an outdated version of a highlight;
                # without this check a note or bookmark at the same location
                # would be silently overwritten.
                and last.type == "Highlight"
                and clipping.location_start == last.location_start
            )
            if is_updated_highlight:
                filtered[-1] = clipping
                logging.info(
                    f"Highlight removed as it seems to be updated.\nOLD:{last.content}\nNEW:{clipping.content}"
                )
            else:
                filtered.append(clipping)
            last = clipping
        return filtered

    def clear_highlights_all(self) -> "KindleClippingCollection":
        """Return a new collection with ``clear_highlights`` applied per book."""
        return KindleClippingCollection(
            {
                title: self.clear_highlights(clippings)
                for title, clippings in self.books.items()
            }
        )

    @staticmethod
    def book_to_markdown(clippings: List[KindleClipping], as_file: bool = True) -> str:
        """Render all clippings of one book as Markdown.

        Args:
            clippings: Non-empty list of clippings of a single book; title and
                author are taken from the first entry.
            as_file: When true, emit a front-matter header followed by a
                "Quotations" section; otherwise emit a heading-based section
                meant to be combined with other books.

        Returns:
            The Markdown text.
        """
        quotations = "\n".join(clipping.to_markdown() for clipping in clippings)
        title = clippings[0].title
        author = clippings[0].author
        dates = [clipping.date for clipping in clippings]
        date_first = min(dates).strftime("%Y-%m-%d")
        date_last = max(dates).strftime("%Y-%m-%d")

        # NOTE(review): the line breaks inside both templates were
        # reconstructed from a whitespace-collapsed source — confirm the
        # blank-line layout against the intended output.
        if as_file:
            return f"""---
title: "{title}"
author: {author}
date_first: {date_first}
date_last: {date_last}
---

## Quotations

{quotations}"""
        return f"""## {title} by {author}

Highlights from {date_first} to {date_last}

{quotations}"""

    def save_as_files(self, output_dir: str, min_highlights_for_separate_file: int = 5):
        """Write the collection as Markdown files into ``output_dir``.

        Books with at least ``min_highlights_for_separate_file`` clippings get
        their own ``<title>.md`` file (characters that are invalid in file
        names are removed from the title). All remaining books are combined
        into ``other.md``, which is always written.

        Args:
            output_dir: Target directory; it is created when missing.
            min_highlights_for_separate_file: Minimum number of clippings a
                book needs to be written to its own file.
        """
        if output_dir:
            # An empty string means "current directory", which makedirs rejects.
            makedirs(output_dir, exist_ok=True)

        others = []
        for title, clippings in self.books.items():
            if len(clippings) >= min_highlights_for_separate_file:
                filename = _INVALID_FILENAME_CHARS.sub("", title)
                target = path.join(output_dir, f"{filename}.md")
                with open(target, "w", encoding="utf-8") as file:
                    file.write(self.book_to_markdown(clippings, as_file=True))
            else:
                others.append(self.book_to_markdown(clippings, as_file=False))

        with open(path.join(output_dir, "other.md"), "w", encoding="utf-8") as file:
            file.write("\n\n".join(others))