from pathlib import Path
import re
from typing import List

import dspy
import yaml
from pydantic import BaseModel

# Set up DSPy and the LM
lm = dspy.LM('anthropic/claude-3-5-haiku-latest', api_key='YOUR_API_KEY')
dspy.configure(lm=lm)

#
# Get the initial list of terms
#

# Define the Term object we want returned
class Term(BaseModel):
    term: str = dspy.OutputField(desc="A glossary term, like: a technical term specific to the subject matter, a concept crucial to understanding an article's main ideas, a term explicitly defined or explained in a post, or a word or phrase that is frequently used or emphasized in the post. Do not include the abbreviation in the 'term' field.")
    abbreviation: str = dspy.OutputField(desc="Populate the abbreviation field if the term is abbreviated in the article, and ensure that it is not pluralized. If there is no abbreviation, populate the abbreviation field with an empty string.")
    definition: str = dspy.OutputField(desc="A definition of the term. Lightly edit the definition so it can stand alone outside the context of the post, but ensure that you do not add any information that is not present in the original text.")
    details: str = dspy.OutputField(desc="Text from the post that expounds a bit on the term, adding texture and details beyond the definition. The 'details' field can be empty if there is no additional context to provide, and multiple paragraphs if there is more than one piece of context to provide.")
    synonyms: List[str] = dspy.OutputField(desc="Any synonyms, acronyms, or alternative terms that are used in the post")

# Find key terms for the post and terms whose definition might not be clear to the reader
class ExtractTerms(dspy.Signature):
    """Find key terms for the post and terms where their definition might not be clear to the reader, from a markdown blog post. Ignore all text between markdown code blocks."""
    post: str = dspy.InputField(desc="the markdown blog post")
    terms: List[Term] = dspy.OutputField(desc="Array of glossary terms.")

extractTerms = dspy.Predict(ExtractTerms)

# Get the terms from the posts
posts_path = Path("../_posts")
glossary = []
for post_file in sorted(posts_path.glob('*.md')):
    print(f"Processing {post_file}")
    with open(post_file, 'r') as f:
        post_content = f.read()
    # Strip the YAML front matter: keep only what follows the closing '---'
    post_content = re.split(r'\n---\n', post_content, maxsplit=2)[-1]
    try:
        terms = extractTerms(post=post_content)
    except Exception as e:
        print(f"Failed to process {post_file}: {e}")
        continue
    for term in terms.terms:
        # Convert the pydantic model to a plain dict so we can add keys below
        term = term.model_dump()
        if term not in glossary:
            # Record the post path relative to the site root
            if str(post_file).startswith('../'):
                term['path'] = str(post_file)[3:]
            else:
                term['path'] = str(post_file)
            print(f"Adding term {term}")
            glossary.append(term)

#
# Condense the glossary to unique terms
#

# Compare two term dicts to see if they are the same term
def compare_terms(term1, term2):
    if term1['term'].lower() == term2['term'].lower():
        return True
    if any(syn.lower() in [s.lower() for s in term2['synonyms']] for syn in term1['synonyms']):
        return True
    if term1['term'].lower() in [s.lower() for s in term2['synonyms']]:
        return True
    return False
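# A quick illustration of the matching rules above (hypothetical values): two
# entries match if their 'term' fields agree case-insensitively, if their
# synonym lists overlap, or if term1's term appears among term2's synonyms.
# Note the check is asymmetric: term2's term is never compared against
# term1's synonyms.
#
#   compare_terms(
#       {'term': 'LLM', 'synonyms': []},
#       {'term': 'Large Language Model', 'synonyms': ['LLM']},
#   )  # -> True, via the third check
#   compare_terms(
#       {'term': 'Large Language Model', 'synonyms': ['LLM']},
#       {'term': 'LLM', 'synonyms': []},
#   )  # -> False: no rule checks term2['term'] against term1's synonyms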
# Condense the glossary by finding identical terms and merging their
# definitions, details, and synonyms.
merged_glossary = {}
for term in glossary:
    found = False
    for key in merged_glossary:
        if compare_terms(term, merged_glossary[key]):
            found = True
            merged_glossary[key]['details'] += "\n\n" + term['details']
            merged_glossary[key]['synonyms'] += term['synonyms']
            merged_glossary[key]['pages'].append(term['path'])
            # Deduplicate the merged synonym list
            merged_glossary[key]['synonyms'] = list(set(merged_glossary[key]['synonyms']))
            break
    if not found:
        # First occurrence of this term: convert its single 'path' into a 'pages' list
        page = term['path']
        term['pages'] = [page]
        del term['path']
        merged_glossary[term['term']] = term

# Sort the merged_glossary by keys
sorted_glossary = dict(sorted(merged_glossary.items()))

# Create the _data directory if it doesn't exist
Path("../_data").mkdir(parents=True, exist_ok=True)

# Write the sorted glossary values to a YAML file
with open('../_data/glossary.yaml', 'w') as yaml_file:
    yaml.dump(list(sorted_glossary.values()), yaml_file, default_flow_style=False, sort_keys=False)
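# For reference, one entry in the generated _data/glossary.yaml would look
# roughly like this (field values here are made up for illustration):
#
#   - term: retrieval-augmented generation
#     abbreviation: RAG
#     definition: A technique that grounds a model's answers in retrieved documents.
#     details: ''
#     synonyms:
#     - RAG
#     pages:
#     - _posts/2024-01-01-example.md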