# Topic modeling journal runs

An example notebook that looks at articles from a single journal — in this case, *Library History* and its variants.

Process:
* build dataset in corpus builder
* download dataset to notebook environment
* use ngrams to build a topic model 
* use the model to infer topics for each article
* track topic frequency over time
* plot the results

The Python library gensim is used for LDA topic modeling.


In [None]:
# Run the local `getDataset` script with the dataset ID and a short name.
# NOTE(review): presumably this saves datasets/library-history.jsonl.gz, which is read below — confirm.
!bash getDataset 5c54351f-d2fa-749f-3efc-0477720bd176 library-history

In [None]:
import json
import gzip
from collections import Counter
from pprint import pprint

import gensim
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordfreq import simple_tokenize

In [None]:
import logging

# Root logger: timestamped records, warnings and above only.
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Explicitly hold the gensim model loggers at the same threshold.
logging.getLogger('gensim.models').setLevel(logging.WARNING)

In [None]:
# Path of the gzipped JSON-lines dataset that is read when building the corpus.
dataset_filename = "datasets/library-history.jsonl.gz"

## Processing

Define functions to:
 - process individual tokens
 - process ngrams
 - convert a TDM document to a list of tokens (a "bag of words") that gensim can consume
    

In [None]:
def process_token(token, strip_stopwords=True):
    """Normalize a raw unigram and discard it when it is too short.

    The text is split with wordfreq's ``simple_tokenize`` and the pieces are
    re-joined with single spaces.

    Args:
        token: Raw token string.
        strip_stopwords: Accepted but currently unused; no stopword filtering
            is performed here.

    Returns:
        The normalized token, or None when it is shorter than three characters.
    """
    pieces = simple_tokenize(token)
    cleaned = " ".join(pieces)
    return cleaned if len(cleaned) >= 3 else None

In [None]:
# Sanity check: the token comes back lower-cased and without the trailing comma.
assert process_token("Title,") == "title"

In [None]:
def process_ngram(token):
    """Collapse a multi-word ngram into a single underscore-joined token.

    Args:
        token: Raw ngram string.

    Returns:
        The pieces produced by wordfreq's ``simple_tokenize`` joined with "_";
        an empty string when tokenization yields nothing.
    """
    return "_".join(simple_tokenize(token))

In [None]:

def doc_to_bow(raw):
    """Convert one dataset document into a flat list of cleaned ngram tokens.

    Args:
        raw: Parsed document dict. The optional "unigramCount", "bigramCount"
            and "trigramCount" entries are read as mappings of ngram -> count;
            a missing or None entry is treated as empty.

    Returns:
        A list with one entry per ngram key that survives cleaning (the counts
        themselves are ignored, so each ngram contributes once), or None when
        nothing survives.
    """
    bow_doc = []
    # Read from the argument rather than a notebook-level variable, so the
    # result depends only on the document that was passed in.
    for gram in (raw.get("unigramCount") or {}):
        clean_gram = process_token(gram)
        if clean_gram:
            bow_doc.append(clean_gram)
    for field in ("bigramCount", "trigramCount"):
        for gram in (raw.get(field) or {}):
            clean_gram = process_ngram(gram)
            if clean_gram:
                bow_doc.append(clean_gram)
    return bow_doc or None

## Build the corpus

Read each document in our dataset, process the ngrams, and convert to a list of documents

In [None]:
# Limit to n documents. Set to None to do all
limit = None
num_docs = 0

# Token lists, one per retained document.
documents = []
# Maps each retained document's position in `documents` to its year and id.
metadata = {}

with gzip.open(dataset_filename, "rb") as inf:
    for row in inf:
        document = json.loads(row)
        _id = document["id"]
        bd = doc_to_bow(document)
        if bd is None:
            # Nothing usable in this document: report its id and leave it out.
            print(_id)
            continue
        # Key by position in `documents` (not by line number in the file) so a
        # skipped document cannot shift metadata out of step with the corpus.
        metadata[len(documents)] = {
            "year": document["publicationYear"],
            "id": _id
        }
        documents.append(bd)
        num_docs += 1
        if (limit is not None) and (num_docs >= limit):
            break

In [None]:
# Number of documents that produced a non-empty token list.
len(documents)

In [None]:
# Build the gensim vocabulary (token <-> integer id) from the token lists.
dictionary = gensim.corpora.Dictionary(documents)

In [None]:
# Vocabulary size before any filtering.
print('Number of unique tokens: %d' % len(dictionary))

In [None]:
# Remove terms that appear in fewer than 5 documents or in more than 50% of documents.
dictionary.filter_extremes(no_below=5, no_above=0.50)
print('Number of unique tokens: %d' % len(dictionary))

In [None]:
# Convert each token list to gensim's bag-of-words form; positions match `documents`.
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

In [None]:
# Size of the corpus that will be passed to the model.
print('Number of documents: %d' % len(bow_corpus))

## Train the model

Run our bow corpus through the LDA model and print the identified topics with the terms.

In [None]:
# Model hyperparameters.
num_topics = 3
passes = 50
iterations = 700
eval_every = None
# Fixed seed so that Restart & Run All reproduces the same topics;
# change the value to explore alternative fits.
random_state = 42

# Train the LDA model.
model = gensim.models.LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

In [None]:
# Print the top terms of each topic, one line per topic (numbered from 1).
for topic_num in range(num_topics):
    # Look words up via dictionary[...] rather than dictionary.id2token: the
    # latter is a lazily built cache that can be empty until the dictionary
    # has been indexed at least once.
    words = [dictionary[wid] for wid, _weight in model.get_topic_terms(topic_num)]
    print("Topic {}".format(str(topic_num + 1).ljust(5)), " ".join(words))

## Track topic changes over time

Run each document through the model to identify its topics, then tally the topic assignments per publication year.

In [None]:
# year -> Counter of how many times each topic id was reported for that year
year_to_topic = {}
# year -> total number of (document, topic) assignments in that year
year_count = Counter()
rows = []

for doc_index, doc_meta in metadata.items():
    pub_year = doc_meta["year"]
    doc_topics = model.get_document_topics(bow_corpus[doc_index])
    for topic_id, _score in doc_topics:
        # Every reported topic counts once, whatever its score.
        year_to_topic.setdefault(pub_year, Counter())[topic_id] += 1
        year_count[pub_year] += 1

In [None]:
# Flatten the per-year counters into (year, 1-based topic number, count) records.
rows = [
    (yr, topic + 1, count)
    for yr, cnt in year_to_topic.items()
    for topic, count in cnt.items()
]

In [None]:
# One row per (year, topic) pair; "n" is how often that topic was assigned in that year.
df = pd.DataFrame(rows, columns=["year", "topic_num", "n"])

In [None]:
def yearly_frequency(row):
    """Return a row's topic count as a share of its year's total.

    Args:
        row: A record exposing "n" (topic count) and "year".

    Returns:
        ``row["n"]`` divided by the ``year_count`` entry for ``row["year"]``.
    """
    year_total = year_count[row["year"]]
    return row["n"] / year_total

In [None]:
# Add the per-year relative frequency of each topic as column "tf".
df["tf"] = df.apply(yearly_frequency, axis=1)

## Results

In [None]:
# Fit and draw one linear trend per topic of its yearly share.
# The returned grid gets its own name: rebinding `plt` would shadow the
# matplotlib.pyplot import and break any later plt.* call.
topic_trends = sns.lmplot(
    x="year",
    y="tf",
    data=df,
    hue="topic_num",
    ci=None,
    palette=sns.color_palette("muted", n_colors=num_topics)
)
topic_trends.set_axis_labels("Publication year", "Share of topic assignments");