"""
kindle_highlights.py
This script provides functionality to parse and process Kindle highlights and notes.
It extracts data from a Kindle clippings file and organizes it into a structured format.
Usage:
from kindle_highlights import KindleClippingCollection
collection = KindleClippingCollection.extract_from_file("path/to/My Clippings.txt")
collection.save_as_files("output_directory")
"""
from pydantic import BaseModel, field_validator, ValidationError
from typing import Optional, Literal, Dict, List
from datetime import datetime
import re
import logging
from os import path
class KindleClipping(BaseModel):
    """One entry (highlight, note or bookmark) from a Kindle clippings file.

    Field values may be supplied as raw strings; pydantic coerces the numeric
    fields and ``parse_date`` converts the date text.
    """

    # Book title and author of the book the clipping belongs to.
    title: str
    author: str
    # Kind of entry; anything else is rejected during validation.
    type: Literal["Highlight", "Note", "Bookmark"]
    # Page reference; must be passed explicitly but may be None.
    page: Optional[int | str]
    # First location covered by the clipping.
    location_start: int
    # Last location covered by the clipping; None for single-location entries.
    location_end: Optional[int]
    # When the clipping was added.
    date: datetime
    # Highlighted text or note body.
    content: str

    @field_validator("date", mode="before")
    @classmethod
    def parse_date(cls, value):
        """Convert the raw date text of a clipping into a ``datetime``.

        Non-string values (for example an already parsed ``datetime``) are
        returned untouched so that pydantic's own datetime validation handles
        them; this lets a model be rebuilt from its own dumped data.

        Args:
            value: Raw value supplied for the ``date`` field.

        Returns:
            A ``datetime`` for recognised strings, otherwise ``value`` as is.

        Raises:
            ValueError: If a string matches none of the known layouts. Pydantic
                turns this into a ``ValidationError``.
        """
        if not isinstance(value, str):
            return value

        formats = (
            # e.g. "Friday, 3 January 2020 10:15:32"
            "%A, %d %B %Y %H:%M:%S",
            # e.g. "Friday, January 3, 2020 10:15:32 AM"
            # NOTE(review): US-style layout - confirm against a real
            # US-locale clippings file.
            "%A, %B %d, %Y %I:%M:%S %p",
        )
        text = value.strip()
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        raise ValueError(f"Unrecognised clipping date: {value!r}")

    def to_location_range(self) -> str:
        """Return ``"start-end"`` when an end location is set, else ``"start"``."""
        if self.location_end:
            return f"{self.location_start}-{self.location_end}"
        return str(self.location_start)

    def to_position(self) -> str:
        """Return a human-readable position, prefixed by the page when known."""
        location = f"loc. {self.to_location_range()}"
        if self.page:
            return f"page {self.page}, {location}"
        return location

    def to_markdown(self) -> str:
        """Render the clipping as a markdown list item.

        Highlights and bookmarks become top-level bullets; notes are rendered
        as a nested ``COMMENT`` line.

        Raises:
            ValueError: If ``type`` holds a value outside the three known kinds
                (possible only if the attribute was changed after validation).
        """
        if self.type == "Highlight":
            return f"* {self.content}\n - {self.to_position()}"
        if self.type == "Note":
            return f" - **COMMENT**: {self.content}"
        if self.type == "Bookmark":
            return f"* **BOOKMARK**: {self.content} {self.to_position()}"
        raise ValueError(f"Unknown type: {self.type}")
class KindleClippingCollection:
    """Clippings grouped by book title, with helpers to parse and export them."""

    # Text that separates consecutive entries in a clippings file.
    _ENTRY_SEPARATOR = "\n==========\n"

    # One entry: a "<title> (<author>)" line, a metadata line and the content.
    # Group names mirror the KindleClipping field names so that
    # ``match.groupdict()`` can be passed straight to the model constructor.
    _CLIPPING_PATTERN = re.compile(
        r"^(?P<title>.*) \((?P<author>.*)\)\n"
        r"- Your (?P<type>Highlight|Note|Bookmark)"
        r"(?: on page (?P<page>[\w\d]+) \|)?(?: at)? location "
        r"(?P<location_start>\d+)(?:-(?P<location_end>\d+))?"
        r" \| Added on (?P<date>.*)\n+"
        r"(?P<content>(?:.|\n)*)",
        re.MULTILINE,
    )

    def __init__(self, books: Dict[str, List["KindleClipping"]]):
        """Store the mapping of book title to that book's clippings."""
        self.books = books

    @classmethod
    def group_by_book_title(
        cls, clippings: List["KindleClipping"]
    ) -> "KindleClippingCollection":
        """Build a collection by grouping ``clippings`` on their ``title``.

        Input order is preserved, both for the books and within each book.
        """
        books: Dict[str, List["KindleClipping"]] = {}
        for clipping in clippings:
            books.setdefault(clipping.title, []).append(clipping)
        return cls(books)

    @classmethod
    def extract_from_file(
        cls, file_path: str, clear_highlights: bool = True
    ) -> "KindleClippingCollection":
        """Parse a clippings file into a collection.

        Entries that do not match the expected layout, or that fail model
        validation, are logged as errors and skipped.

        Args:
            file_path: Path of the clippings file (read as UTF-8, a leading
                BOM is ignored).
            clear_highlights: When true, superseded highlights are dropped via
                ``clear_highlights_all``.

        Returns:
            The parsed clippings grouped by book title.
        """
        with open(file_path, "r", encoding="utf-8-sig") as file:
            text = file.read()

        clippings: List["KindleClipping"] = []
        for part in text.split(cls._ENTRY_SEPARATOR):
            # NOTE(review): some clippings files appear to repeat the BOM at
            # the start of individual entries; strip it so it cannot end up
            # inside the title.
            part = part.lstrip("\ufeff")
            # The text after the final separator is empty; it is not an entry.
            if not part.strip():
                continue
            match = cls._CLIPPING_PATTERN.match(part)
            if match is None:
                logging.error("Error parsing clipping (no match):\n%s", part)
                continue
            try:
                clippings.append(KindleClipping(**match.groupdict()))
            except ValidationError as e:
                logging.error("Error parsing clipping (%s):\n%s", e, part)

        res = cls.group_by_book_title(clippings)
        if clear_highlights:
            res = res.clear_highlights_all()
        return res

    @staticmethod
    def clear_highlights(clippings: List["KindleClipping"]) -> List["KindleClipping"]:
        """Drop highlights that were replaced by a later one at the same start.

        A highlight that directly follows another highlight with the same
        ``location_start`` is treated as an updated version and replaces it.
        Notes and bookmarks are never replaced.

        Args:
            clippings: Clippings of one book, in file order.

        Returns:
            A new list; the input list is not modified. Empty input yields an
            empty list.
        """
        if not clippings:
            return []

        filtered = [clippings[0]]
        for clipping in clippings[1:]:
            previous = filtered[-1]
            is_update = (
                clipping.type == "Highlight"
                # Only a highlight can be superseded; without this check a
                # note or bookmark at the same location would be discarded.
                and previous.type == "Highlight"
                and clipping.location_start == previous.location_start
            )
            if is_update:
                filtered[-1] = clipping
                logging.info(
                    "Highlight removed as it seems to be updated.\nOLD:%s\nNEW:%s",
                    previous.content,
                    clipping.content,
                )
            else:
                filtered.append(clipping)
        return filtered

    def clear_highlights_all(self) -> "KindleClippingCollection":
        """Return a new collection with ``clear_highlights`` applied per book."""
        return KindleClippingCollection(
            {
                title: self.clear_highlights(clippings)
                for title, clippings in self.books.items()
            }
        )

    @staticmethod
    def book_to_markdown(clippings: List["KindleClipping"], as_file: bool = True) -> str:
        """Render the clippings of one book as markdown.

        Args:
            clippings: Non-empty list of clippings of a single book; title and
                author are taken from the first element.
            as_file: When true, emit a front-matter header suitable for a
                standalone file; otherwise emit a section with a ``##`` heading.

        Returns:
            The markdown text.

        Raises:
            IndexError: If ``clippings`` is empty.
        """
        quotations = "\n".join(clipping.to_markdown() for clipping in clippings)
        title = clippings[0].title
        author = clippings[0].author
        dates = [clipping.date for clipping in clippings]
        date_first = min(dates).strftime("%Y-%m-%d")
        date_last = max(dates).strftime("%Y-%m-%d")

        if as_file:
            return (
                "---\n"
                f'title: "{title}"\n'
                f"author: {author}\n"
                f"date_first: {date_first}\n"
                f"date_last: {date_last}\n"
                "---\n"
                "## Quotations\n"
                f"{quotations}"
            )
        return (
            f"## {title} by {author}\n"
            f"Highlights from {date_first} to {date_last}\n"
            f"{quotations}"
        )

    def save_as_files(self, output_dir: str, min_highlights_for_separate_file: int = 5):
        """Write the collection to markdown files in ``output_dir``.

        Books with at least ``min_highlights_for_separate_file`` clippings get
        their own ``<title>.md`` file; all remaining books are concatenated
        into ``other.md``. ``output_dir`` is created if it does not exist.

        Args:
            output_dir: Target directory.
            min_highlights_for_separate_file: Minimum number of clippings for a
                book to be written to its own file.
        """
        import os

        os.makedirs(output_dir, exist_ok=True)

        others = []
        for title, clippings in self.books.items():
            if len(clippings) >= min_highlights_for_separate_file:
                # Remove characters that are not allowed in file names.
                filename = re.sub(r'[<>:"/\\|?*]', "", title)
                target = path.join(output_dir, f"{filename}.md")
                with open(target, "w", encoding="utf-8") as file:
                    file.write(self.book_to_markdown(clippings, as_file=True))
            else:
                others.append(self.book_to_markdown(clippings, as_file=False))

        with open(path.join(output_dir, "other.md"), "w", encoding="utf-8") as file:
            file.write("\n\n".join(others))