{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic modeling journal runs\n", "\n", "An example notebook looking at articles from a single journal, in this case Library History and its variants. \n", "\n", "Process:\n", "* build dataset in corpus builder\n", "* download dataset to notebook environment\n", "* use ngrams to build a topic model \n", "* use the model to infer topics for each article\n", "* track topic frequency over time\n", "* plot the results\n", "\n", "The Python library gensim is used for LDA topic modeling.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "!bash getDataset 5c54351f-d2fa-749f-3efc-0477720bd176 library-history" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import gzip\n", "from collections import Counter\n", "from pprint import pprint\n", "\n", "import gensim\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from wordfreq import simple_tokenize" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import logging\n", "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)\n", "\n", "logging.getLogger('gensim.models').setLevel(logging.WARN)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_filename = \"datasets/library-history.jsonl.gz\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Processing\n", "\n", "Define functions to:\n", " - process individual tokens\n", " - process ngrams\n", " - convert a TDM document to a gensim \"bag of words\"\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_token(token, strip_stopwords=True):\n", " token = \" \".join(simple_tokenize(token))\n", " if len(token) < 3:\n", " return\n", " return token" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert process_token(\"Title,\") == \"title\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_ngram(token):\n", " token = simple_tokenize(token)\n", " return \"_\".join(token)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "def doc_to_bow(raw):\n", " bow_doc = []\n", " ngrams = document.get(\"unigramCount\", {})\n", " for gram, count in ngrams.items():\n", " cg = process_token(gram)\n", " if (cg is None) or len(cg) == 0:\n", " continue\n", " else:\n", " #bow_doc += [cg] * count\n", " bow_doc.append(cg)\n", " for ngram, ngram_len in [(\"bigramCount\", 2), (\"trigramCount\", 3)]:\n", " for gram, count in document.get(ngram, {}).items():\n", " #if count > 1:\n", " # continue\n", " clean_gram = process_ngram(gram)\n", " if (clean_gram is None) or len(clean_gram) == 0:\n", " continue\n", " #bow_doc += [clean_gram] * count \n", " bow_doc.append(clean_gram)\n", " if len(bow_doc) == 0:\n", " return\n", " return bow_doc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build the corpus\n", "\n", "Read each document in our dataset, process the ngrams, and convert to a list of documents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Limit to n documents. Set to None to do all\n", "limit = None\n", "num_docs = 0\n", "\n", "documents = []\n", "metadata = {}\n", "\n", "with gzip.open(dataset_filename, \"rb\") as inf:\n", " for idx, row in enumerate(inf):\n", " document = json.loads(row)\n", " _id = document[\"id\"]\n", " bd = doc_to_bow(document)\n", " metadata[idx] = {\n", " \"year\": document[\"publicationYear\"],\n", " \"id\": _id\n", " }\n", " if bd is None:\n", " print(_id)\n", " continue\n", " else:\n", " documents.append(bd)\n", " num_docs += 1\n", " if (limit is not None) and (num_docs >= limit):\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(documents)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dictionary = gensim.corpora.Dictionary(documents)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Number of unique tokens: %d' % len(dictionary))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Remove terms that appear in less than 20 of and more than 50% of documents. \n", "dictionary.filter_extremes(no_below=5, no_above=0.50)\n", "print('Number of unique tokens: %d' % len(dictionary))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print('Number of documents: %d' % len(bow_corpus))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train the model\n", "\n", "Run our bow corpus through the LDA model and print the identified topics with the terms." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#logging.getLogger('gensim.models').setLevel(logging.ERROR)\n", "\n", "num_topics = 3\n", "passes = 50\n", "iterations = 700\n", "eval_every = None\n", "\n", "# Train the LDA model.\n", "model = gensim.models.LdaModel(\n", " corpus=bow_corpus,\n", " id2word=dictionary,\n", " iterations=iterations,\n", " num_topics=num_topics,\n", " passes=passes,\n", " eval_every=eval_every\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "for topic_num in range(0, num_topics):\n", " word_ids = model.get_topic_terms(topic_num)\n", " words = []\n", " for wid, weight in word_ids:\n", " word = dictionary.id2token[wid]\n", " words.append(word)\n", " print(\"Topic {}\".format(str(topic_num + 1).ljust(5)), \" \".join(words))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Track topic changes over time\n", "\n", "Run each document through the model to identify the topics per document per year" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "year_to_topic = {}\n", "year_count = Counter()\n", "rows = []\n", "\n", "for idx, meta in metadata.items():\n", " year = meta[\"year\"]\n", " cdoc = bow_corpus[idx]\n", " topics = model.get_document_topics(cdoc)\n", " for topic, score in topics:\n", " cnt = year_to_topic.get(year, Counter())\n", " cnt[topic] += 1\n", " year_to_topic[year] = cnt\n", " year_count[year] += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rows = []\n", "for yr, cnt in year_to_topic.items():\n", " for topic, count in cnt.items():\n", " rows.append((yr, topic + 1, count))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(rows, columns=[\"year\", \"topic_num\", \"n\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def yearly_frequency(row):\n", " return row[\"n\"] / year_count[row[\"year\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"tf\"] = df.apply(yearly_frequency, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "plt = sns.lmplot(\n", " x=\"year\",\n", " y=\"tf\", \n", " data=df, \n", " hue=\"topic_num\",\n", " ci=None,\n", " palette=sns.color_palette(\"muted\", n_colors=num_topics)\n", ");" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": null }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "211.188px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }