{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# New TDM client demo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download and filter metadata with Pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Parameters:\n", "dataset_id = \"943b499d-2d00-e422-095f-97274a8b2121\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Importing your dataset with a dataset ID\n", "import tdm_client\n", "\n", "dataset_metadata = tdm_client.get_metadata(dataset_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(dataset_metadata)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_document_count = len(df)\n", "print(\"Total documents\", dataset_document_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set the pandas option to show all columns\n", "# Use the full option key; bare \"max_columns\" relies on fragile prefix matching\n", "pd.set_option(\"display.max_columns\", None)\n", "\n", "df.head() # Show the first five rows of our DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "id_list = df['id'].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'http://www.jstor.org/stable/2871420' in id_list" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop each of these named columns\n", "df = df.drop(['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language'], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop articles without an author\n", "df = df.dropna(subset=['creator'])" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Original total\", dataset_document_count)\n", "print(\"Filtered total\", len(df))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Examples for filtering the data based on the values found under 'title'\n", "\n", "df = df[df.title != 'Review Article'] # Remove articles with title \"Review Article\"\n", "df = df[df.title != 'Front Matter'] # Remove articles with title \"Front Matter\"\n", "df = df[df.title != 'Back Matter'] # Remove articles with title \"Back Matter\"\n", "\n", "# Remove articles with fewer than 3000 words, adjust or remove\n", "\n", "df = df[df.wordCount > 3000] " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(\"Original total\", dataset_document_count)\n", "print(\"Filtered total\", len(df))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_id_list = df[\"id\"].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.groupby(['publicationYear'])['id'].agg('count').plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12); " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by year', figsize=(20, 5), fontsize=12);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count word frequencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_json_file = tdm_client.get_dataset(dataset_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import gzip\n", "from collections import Counter\n", "\n", "word_frequency = Counter()" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "with gzip.open(dataset_json_file, \"rb\") as input_file:\n", " for row in input_file:\n", " document = json.loads(row)\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " word_frequency[gram] += count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for gram, count in word_frequency.most_common(25):\n", " print(gram.ljust(20), count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = stopwords.words('english')\n", "stop_words[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "transformed_word_frequency = Counter()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Count lowercased, non-stopword unigrams across ALL filtered documents\n", "for document in tdm_client.dataset_reader(dataset_json_file):\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " clean_gram = gram.lower()\n", " if clean_gram in stop_words:\n", " continue\n", " transformed_word_frequency[clean_gram] += count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for gram, count in transformed_word_frequency.most_common(25):\n", " print(gram.ljust(20), count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Take the 25 MOST COMMON words (not 25 arbitrary entries) for the chart below\n", "df = pd.DataFrame(transformed_word_frequency.most_common(25), columns=[\"ngram\", \"count\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.sort_values('count', 
ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"count\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Significant terms\n", "\n", "Run TFIDF on the first 500 documents in the filtered corpus." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gensim" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clean a single token: lowercase it and reject stopwords, short tokens,\n", "# and non-alphabetic tokens (returns None for rejected tokens).\n", "# Defined BEFORE the corpus-building cell so a fresh Restart & Run All works.\n", "def process_token(token):\n", " token = token.lower()\n", " if token in stop_words:\n", " return\n", " if len(token) < 4:\n", " return\n", " if not(token.isalpha()):\n", " return\n", " return token" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Limit to n documents. Set to None to do all\n", "\n", "limit = 500\n", "\n", "n = 0\n", "documents = []\n", "for document in tdm_client.dataset_reader(dataset_json_file):\n", " processed_document = []\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " clean_gram = process_token(gram)\n", " if clean_gram is None:\n", " continue\n", " processed_document.append(clean_gram)\n", " if len(processed_document) > 0:\n", " documents.append(processed_document)\n", " n += 1\n", " if (limit is not None) and (n >= limit):\n", " break\n", "\n", "dictionary = gensim.corpora.Dictionary(documents)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dict(list(dictionary.token2id.items())[0:10]) # Print the first ten tokens and their associated IDs.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = 
gensim.models.TfidfModel(bow_corpus)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus_tfidf = model[bow_corpus]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rows = []\n", "for doc in corpus_tfidf:\n", " for term_id, score in doc:\n", " rows.append([dictionary.get(term_id), score])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(rows, columns=[\"ngram\", \"score\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50 = df.sort_values(\"score\", ascending=False).head(n=50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50.sort_values(\"score\", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"score\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LDA topic modeling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Size the frequency filter from the corpus that was actually indexed\n", "doc_count = len(documents)\n", "num_topics = 7 # Change the number of topics\n", "\n", "# Remove terms that appear in less than 10% of documents and more than 75% of documents.\n", "# no_below is an ABSOLUTE document count (no_above is a fraction), so scale it by doc_count.\n", "dictionary.filter_extremes(no_below=max(1, int(doc_count * 0.10)), no_above=0.75)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Train the LDA model.\n", "model = gensim.models.LdaModel(\n", " corpus=bow_corpus,\n", " id2word=dictionary,\n", " num_topics=num_topics\n", ")" ] }, 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for topic_num in range(0, num_topics):\n", " word_ids = model.get_topic_terms(topic_num)\n", " words = []\n", " for wid, weight in word_ids:\n", " word = dictionary[wid] # id2token is built lazily; indexing the Dictionary populates it\n", " words.append(word)\n", " print(\"Topic {}\".format(str(topic_num).ljust(5)), \" \".join(words))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "223.188px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }