{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# New TDM client demo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download and filter metadata with Pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Parameters:\n", "dataset_id = \"943b499d-2d00-e422-095f-97274a8b2121\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Importing your dataset with a dataset ID\n", "import tdm_client\n", "\n", "dataset_metadata = tdm_client.get_metadata(dataset_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(dataset_metadata)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_document_count = len(df)\n", "print(\"Total documents\", dataset_document_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set the pandas option to show all columns\n", "# Use the full option key; bare \"max_columns\" relies on fragile prefix matching\n", "pd.set_option(\"display.max_columns\", None)\n", "\n", "df.head() # Show the first five rows of our DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "id_list = df['id'].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'http://www.jstor.org/stable/2871420' in id_list" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop each of these named columns\n", "df = df.drop(['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language'], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop articles without an author\n", "df = df.dropna(subset=['creator'])" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Original total\", dataset_document_count)\n", "print(\"Filtered total\", len(df))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Examples for filtering the data based on the values found under 'title'\n", "\n", "df = df[df.title != 'Review Article'] # Remove articles with title \"Review Article\"\n", "df = df[df.title != 'Front Matter'] # Remove articles with title \"Front Matter\"\n", "df = df[df.title != 'Back Matter'] # Remove articles with title \"Back Matter\"\n", "\n", "# Remove articles with fewer than 3000 words, adjust or remove\n", "\n", "df = df[df.wordCount > 3000] " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(\"Original total\", dataset_document_count)\n", "print(\"Filtered total\", len(df))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_id_list = df[\"id\"].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.groupby(['publicationYear'])['id'].agg('count').plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12); " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by year', figsize=(20, 5), fontsize=12);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count word frequencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_json_file = tdm_client.get_dataset(dataset_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import gzip\n", "from collections import Counter\n", "\n", "word_frequency = Counter()" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "with gzip.open(dataset_json_file, \"rb\") as input_file:\n", " for row in input_file:\n", " document = json.loads(row)\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " word_frequency[gram] += count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for gram, count in word_frequency.most_common(25):\n", " print(gram.ljust(20), count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = stopwords.words('english')\n", "stop_words[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "transformed_word_frequency = Counter()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Count lowercased, non-stopword unigrams across ALL filtered documents\n", "for document in tdm_client.dataset_reader(dataset_json_file):\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " clean_gram = gram.lower()\n", " if clean_gram in stop_words:\n", " continue\n", " transformed_word_frequency[clean_gram] += count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for gram, count in transformed_word_frequency.most_common(25):\n", " print(gram.ljust(20), count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Take the 25 MOST COMMON words (not 25 arbitrary entries) for the chart below\n", "df = pd.DataFrame(transformed_word_frequency.most_common(25), columns=[\"ngram\", \"count\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.sort_values('count', 
ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"count\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Significant terms\n", "\n", "Run TFIDF on the first 500 documents in the filtered corpus." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gensim" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clean a single token: lowercase it and reject stopwords, short tokens,\n", "# and non-alphabetic tokens (returns None for rejected tokens).\n", "# Defined BEFORE the corpus-building cell so a fresh Restart & Run All works.\n", "def process_token(token):\n", " token = token.lower()\n", " if token in stop_words:\n", " return\n", " if len(token) < 4:\n", " return\n", " if not(token.isalpha()):\n", " return\n", " return token" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Limit to n documents. Set to None to do all\n", "\n", "limit = 500\n", "\n", "n = 0\n", "documents = []\n", "for document in tdm_client.dataset_reader(dataset_json_file):\n", " processed_document = []\n", " _id = document[\"id\"]\n", " if _id in filtered_id_list:\n", " unigrams = document.get(\"unigramCount\", {})\n", " for gram, count in unigrams.items():\n", " clean_gram = process_token(gram)\n", " if clean_gram is None:\n", " continue\n", " processed_document.append(clean_gram)\n", " if len(processed_document) > 0:\n", " documents.append(processed_document)\n", " n += 1\n", " if (limit is not None) and (n >= limit):\n", " break\n", "\n", "dictionary = gensim.corpora.Dictionary(documents)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dict(list(dictionary.token2id.items())[0:10]) # Print the first ten tokens and their associated IDs.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = 
gensim.models.TfidfModel(bow_corpus)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus_tfidf = model[bow_corpus]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rows = []\n", "for doc in corpus_tfidf:\n", " for term_id, score in doc:\n", " rows.append([dictionary.get(term_id), score])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(rows, columns=[\"ngram\", \"score\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50 = df.sort_values(\"score\", ascending=False).head(n=50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50.sort_values(\"score\", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x=\"ngram\", y=\"score\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LDA topic modeling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Size the frequency filter from the corpus that was actually indexed\n", "doc_count = len(documents)\n", "num_topics = 7 # Change the number of topics\n", "\n", "# Remove terms that appear in less than 10% of documents and more than 75% of documents.\n", "# no_below is an ABSOLUTE document count (no_above is a fraction), so scale it by doc_count.\n", "dictionary.filter_extremes(no_below=max(1, int(doc_count * 0.10)), no_above=0.75)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bow_corpus = [dictionary.doc2bow(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Train the LDA model.\n", "model = gensim.models.LdaModel(\n", " corpus=bow_corpus,\n", " id2word=dictionary,\n", " num_topics=num_topics\n", ")" ] }, 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for topic_num in range(0, num_topics):\n", " word_ids = model.get_topic_terms(topic_num)\n", " words = []\n", " for wid, weight in word_ids:\n", " word = dictionary[wid] # id2token is built lazily; indexing the Dictionary populates it\n", " words.append(word)\n", " print(\"Topic {}\".format(str(topic_num).ljust(5)), \" \".join(words))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "223.188px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }