# New TDM client demo

### Download and filter metadata with Pandas

In [None]:
# Parameters:
# Identifier of the dataset to work with; later cells pass it to
# tdm_client.get_metadata() and tdm_client.get_dataset().
dataset_id = "943b499d-2d00-e422-095f-97274a8b2121"


In [None]:
# Importing your dataset with a dataset ID
import tdm_client

# Fetch the metadata for the dataset. The result is handed straight to
# pd.read_csv() in a later cell, so it is presumably a path/URL/file-like
# object pointing at CSV data — TODO confirm against the tdm_client docs.
dataset_metadata = tdm_client.get_metadata(dataset_id)

In [None]:
import pandas as pd

In [None]:
# Load the dataset metadata (CSV) into a DataFrame for filtering and plotting.
df = pd.read_csv(dataset_metadata)

In [None]:
# Remember how many rows the unfiltered metadata has so that later cells can
# report how much the filters removed.
dataset_document_count = df.shape[0]
print("Total documents", dataset_document_count)

In [None]:
# Set the pandas option to show all columns.
# The fully qualified name is required: the bare "max_columns" pattern matches
# more than one option in recent pandas releases (e.g. display.max_columns and
# styler.render.max_columns) and raises OptionError.
pd.set_option("display.max_columns", None)

df.head()  # Show the first five rows of our DataFrame

In [None]:
# Collect the 'id' column of the unfiltered metadata as a plain Python list.
id_list = df['id'].tolist()

In [None]:
# Check whether one specific document ID is part of the dataset
# (the cell displays True or False).
'http://www.jstor.org/stable/2871420' in id_list

In [None]:
# Remove the metadata columns that the rest of the notebook does not use.
unused_columns = ['outputFormat', 'pageEnd', 'pageStart', 'datePublished', 'language']
df = df.drop(columns=unused_columns)

In [None]:
# Keep only the rows whose 'creator' (author) field is present.
has_creator = df['creator'].notna()
df = df[has_creator]

In [None]:
# Compare the size of the unfiltered dataset with what is left after filtering.
print("Original total", dataset_document_count)
print("Filtered total", len(df))

In [None]:
# Examples for filtering the data based on the values found under 'title'

# Titles that mark non-article content; rows carrying any of them are removed.
unwanted_titles = ['Review Article', 'Front Matter', 'Back Matter']
keep_title = ~df.title.isin(unwanted_titles)

# Keep only articles with more than 3000 words (a document of exactly 3000
# words is dropped); adjust or remove
keep_length = df.wordCount > 3000

df = df[keep_title & keep_length]

In [None]:
# Report again how many documents remain now that the title and word-count
# filters have been applied.
print("Original total", dataset_document_count)
print("Filtered total", len(df))

In [None]:
# IDs of the documents that survived filtering; later cells use this list to
# decide which documents of the full dataset to process.
filtered_id_list = df["id"].tolist()

In [None]:
# Number of documents per publication year, drawn as a bar chart.
docs_per_year = df.groupby(['publicationYear'])['id'].count()
docs_per_year.plot.bar(title='Documents by year', figsize=(20, 5), fontsize=12);

In [None]:
# Total page count per publication year, drawn as a bar chart.
# The data is grouped by 'publicationYear', so the chart is titled "by year"
# (the previous title said "by decade", which did not match the grouping).
df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by year', figsize=(20, 5), fontsize=12);

### Count word frequencies

In [None]:
# Fetch the full dataset. Later cells open the result with gzip.open() and
# parse one JSON document per line, so this is presumably the path to a
# gzip-compressed JSON-lines file — TODO confirm against the tdm_client docs.
dataset_json_file = tdm_client.get_dataset(dataset_id)

In [None]:
import json
import gzip
from collections import Counter

# Running total of how often each unigram occurs across the filtered documents.
word_frequency = Counter()

In [None]:
# Tally unigram counts across every document that survived the filtering.
# A set makes each membership test O(1) instead of scanning the whole list.
filtered_ids = set(filtered_id_list)

with gzip.open(dataset_json_file, "rb") as input_file:
    for row in input_file:  # each line is parsed as one JSON document
        document = json.loads(row)
        if document["id"] not in filtered_ids:
            continue
        # Default to an empty dict (not a list) so a document without
        # "unigramCount" contributes nothing instead of raising.
        unigrams = document.get("unigramCount", {})
        # Counter.update() adds the per-document counts to the running totals.
        word_frequency.update(unigrams)

In [None]:
# Show the 25 most frequent unigrams, padding each word to 20 characters so
# the counts line up in a column.
for word, total in word_frequency.most_common(25):
    print(word.ljust(20), total)

In [None]:
from nltk.corpus import stopwords
# English stop words from the NLTK corpus; used below to drop very common
# words from the frequency counts.
stop_words = stopwords.words('english')
stop_words[:10]  # Preview the first ten stop words

In [None]:
# Running total of lower-cased, stop-word-free unigram counts.
transformed_word_frequency = Counter()

In [None]:
# Count lower-cased unigrams across the filtered documents, skipping stop words.
# Sets give O(1) membership tests for both the document IDs and the stop words.
filtered_ids = set(filtered_id_list)
stop_word_set = set(stop_words)

# NOTE: the earlier version of this cell ended with a `break`, which stopped
# the scan after a single document; it has been removed so the totals cover
# the whole filtered corpus, matching the unfiltered count above.
for document in tdm_client.dataset_reader(dataset_json_file):
    if document["id"] not in filtered_ids:
        continue
    # Default to an empty dict (not a list) so .items() is always available.
    unigrams = document.get("unigramCount", {})
    for gram, count in unigrams.items():
        clean_gram = gram.lower()
        if clean_gram in stop_word_set:
            continue
        transformed_word_frequency[clean_gram] += count

In [None]:
# Show the 25 most frequent words after lower-casing and stop-word removal,
# padding each word to 20 characters so the counts line up.
for word, total in transformed_word_frequency.most_common(25):
    print(word.ljust(20), total)

In [None]:
# Build a DataFrame of the 25 most frequent words for plotting.
# most_common(25) is used instead of slicing items(): a Counter iterates in
# insertion order, so slicing returned the first 25 words seen, not the 25
# most frequent ones.
# NOTE: this rebinds `df`, replacing the metadata DataFrame used earlier.
df = pd.DataFrame(transformed_word_frequency.most_common(25), columns=["ngram", "count"])

In [None]:
# Preview the first five rows of the word-frequency DataFrame.
df.head()

In [None]:
# Horizontal bar chart of the word counts. Sorting ascending places the
# largest count last, which barh draws at the top of the chart.
df.sort_values('count', ascending=True).plot.barh(title='Frequent words', figsize=(20, 10), fontsize=12, x="ngram", y="count");

### Significant terms

Run TF-IDF on the first documents in the filtered corpus (up to 500 by default; see `limit` in the cell below).

In [None]:
import gensim

In [None]:
def process_token(token):
    """Normalize a token, or return None when it should be discarded.

    The token is lower-cased and then rejected if it is a stop word, is
    shorter than four characters, or contains non-alphabetic characters.
    Defined before the loop below so the notebook runs top to bottom.
    """
    token = token.lower()
    if token in stop_words:
        return None
    if len(token) < 4:
        return None
    if not token.isalpha():
        return None
    return token


# Limit to n documents. Set to None to do all

limit = 500

# A set gives O(1) membership tests instead of scanning the ID list each time.
filtered_ids = set(filtered_id_list)

n = 0  # number of documents added to `documents` so far
documents = []
for document in tdm_client.dataset_reader(dataset_json_file):
    if document["id"] not in filtered_ids:
        continue
    # Default to an empty dict (not a list) so iteration is always valid.
    unigrams = document.get("unigramCount", {})
    # Each distinct surviving token is added once; the per-document counts
    # are not used here.
    processed_document = []
    for gram in unigrams:
        clean_gram = process_token(gram)
        if clean_gram is not None:
            processed_document.append(clean_gram)
    if processed_document:
        documents.append(processed_document)
        n += 1
        if (limit is not None) and (n >= limit):
            break

# Map every token that occurs in `documents` to an integer ID.
dictionary = gensim.corpora.Dictionary(documents)

In [None]:
# Peek at the token -> ID mapping that the dictionary built.
dict(list(dictionary.token2id.items())[0:10]) # Print the first ten tokens and their associated IDs.


In [None]:
# Convert every tokenized document into gensim's bag-of-words representation.
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus.
model = gensim.models.TfidfModel(bow_corpus)

In [None]:
# Apply the TF-IDF model to the corpus; iterating the result yields, per
# document, (term_id, score) pairs (see the next cell).
corpus_tfidf = model[bow_corpus]

In [None]:
# Flatten the per-document TF-IDF scores into [term, score] rows, translating
# each term ID back to its token through the dictionary.
rows = [
    [dictionary.get(term_id), score]
    for doc in corpus_tfidf
    for term_id, score in doc
]

In [None]:
# One row per (document, term) pair with its TF-IDF score.
# NOTE: this rebinds `df` again, replacing the word-frequency DataFrame.
df = pd.DataFrame(rows, columns=["ngram", "score"])

In [None]:
# Preview the first five rows of the TF-IDF DataFrame.
df.head()

In [None]:
# Summary statistics for the numeric 'score' column.
df.describe()

In [None]:
# The 50 highest-scoring rows. Rows are per (document, term) pair, so the same
# ngram can appear more than once if it scores highly in several documents.
top_50 = df.sort_values("score", ascending=False).head(n=50)

In [None]:
# Horizontal bar chart of the top-scoring terms. Sorting ascending places the
# highest score last, which barh draws at the top of the chart.
top_50.sort_values("score", ascending=True).plot.barh(title='Significant terms', figsize=(20, 10), fontsize=12, x="ngram", y="score");

### LDA topic modeling

In [None]:
# The dictionary was built from `documents`, so document-frequency thresholds
# must be relative to that corpus, not to the unfiltered ID list.
doc_count = len(documents)
num_topics = 7  # Change the number of topics

# Remove terms that appear in less than 10% of documents and more than 75% of documents.
# `no_below` is an absolute number of documents while `no_above` is a fraction
# of the corpus, so the 10% threshold is converted to a count here
# (integer ceiling of doc_count / 10, and at least 1). The previous constant
# `10 * .10` always evaluated to 1 regardless of corpus size.
min_doc_frequency = max(1, (doc_count + 9) // 10)
dictionary.filter_extremes(no_below=min_doc_frequency, no_above=0.75)


In [None]:
# Rebuild the bag-of-words corpus: the dictionary was filtered in the previous
# cell, so the vectors created earlier no longer correspond to it.
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

In [None]:
# Train the LDA model.
# NOTE: this rebinds `model`, replacing the TF-IDF model created earlier.
# NOTE(review): no random seed is passed, so the topics may differ from run
# to run — confirm whether reproducibility matters here.
model = gensim.models.LdaModel(
 corpus=bow_corpus,
 id2word=dictionary,
 num_topics=num_topics
)

In [None]:
# Print the top terms of every topic on one line each.
for topic_num in range(num_topics):
    # Look tokens up with dictionary[wid] rather than dictionary.id2token[wid]:
    # gensim fills id2token lazily and clears it when the dictionary is
    # filtered, so indexing id2token directly can raise KeyError.
    words = [dictionary[wid] for wid, _weight in model.get_topic_terms(topic_num)]
    print("Topic {}".format(str(topic_num).ljust(5)), " ".join(words))