Last active: February 10, 2022 16:45
Revisions
dannguyen renamed this gist
Mar 9, 2015 · 1 changed file with 15 additions and 7 deletions.
The changed portion of the file after this revision:

```python
import sys
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy

## make a Multinomial-NB and CountVectorized pipeline by default
## or switch in tf-idf as your vectorizer, and LinearSVC for your classifier
def easy_pipeline(vect = 'tfidf', clf = 'mnb', min_df = 0.01, max_df = 0.95, stop_words = None, decode_error = 'ignore'):
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    if clf == 'lsvc':
        C = LinearSVC
    else:
        C = MultinomialNB
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df, stop_words = stop_words, decode_error = decode_error)),
        ('clf', C()),
    ])
    return pipeline
```
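A minimal usage sketch for this version of the helper; the folder path is an illustrative assumption (it echoes the usage docstring added in the next revision entry), not something fixed by the gist:

```python
# Illustrative usage of easy_pipeline() as defined above; the folder path is an assumption.
# load_files() expects one sub-directory of text files per label.
from sklearn.datasets import load_files

dataset = load_files("data-hold/20news", shuffle=False)

# tf-idf features with a LinearSVC classifier...
p = easy_pipeline(vect='tfidf', clf='lsvc')
p.fit(dataset.data, dataset.target)

# ...or the default CountVectorizer + MultinomialNB combination
p2 = easy_pipeline(vect='count', clf='mnb')
p2.fit(dataset.data, dataset.target)
```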
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 18 additions and 7 deletions.
The changed portions of the file after this revision:

```python
# some convenience functions here, nothing new
'''
# usage:
from easypipe import easy_pipeline
from easypipe import print_metrics
data_folder = "data-hold/20news"
p = easy_pipeline()
print_metrics(p, data_folder)
'''
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics

# ... (unchanged lines omitted in the diff) ...

## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(vect = 'tfidf', min_df = 0.01, max_df = 0.95, stop_words = None, decode_error = 'ignore'):
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df, stop_words = stop_words, decode_error = decode_error)),
        ('clf', MultinomialNB()),
    ])
    return pipeline

## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, data_folder, test_size = 0.25):
    dataset = load_files(data_folder, shuffle = False)
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size = test_size, random_state=None)
    pipeline.fit(docs_train, y_train)
    # ... (remainder of print_metrics unchanged in this diff) ...
```
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 43 additions and 0 deletions.
The new file added in this revision:

```python
# some convenience functions here, nothing new
# usage:
#
# data_folder = "stuff/"
# dataset = load_files(data_folder, shuffle = False)
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy

## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(min_df = 0.01, max_df = 0.95, stop_words = None):
    pipeline = Pipeline([
        ('vect', CountVectorizer(min_df=min_df, max_df=max_df, stop_words = stop_words)),
        ('clf', MultinomialNB()),
    ])
    return pipeline

## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, dataset, test_size = 0.25):
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size = test_size, random_state=None)
    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))

    ## print out top 10 words
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    for i, class_label in enumerate(dataset.target_names):
        topt = numpy.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label, ", ".join(vect.get_feature_names()[j] for j in topt)))
```
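A usage sketch that completes the comment at the top of the file; the "stuff/" folder name is just the placeholder from that comment, and `load_files` expects one sub-directory of text files per label:

```python
# Placeholder folder from the usage comment above; one sub-directory per label/author
data_folder = "stuff/"
dataset = load_files(data_folder, shuffle = False)

pipeline = easy_pipeline()        # CountVectorizer(min_df=0.01, max_df=0.95) + MultinomialNB
print_metrics(pipeline, dataset)  # per-label precision/recall/F1 plus the top-10 words per label
```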
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 68 additions and 0 deletions.
The new file added in this revision:

```python
# Using just CountVectorizer (threshold of 0.05 to 0.75) and Multinomial-Naive Bayes
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy

data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle = False)
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.05, max_df=0.75)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))

################ Performance report
#                     precision    recall  f1-score   support
# charles-m-blow           0.87      0.71      0.78        82
# david-brooks             0.93      0.75      0.83       187
# frank-bruni              0.69      0.83      0.76        60
# gail-collins             0.84      0.89      0.87       169
# joe-nocera               0.80      0.83      0.81        81
# maureen-dowd             0.76      0.88      0.81       128
# nicholas-kristof         0.84      0.83      0.83       123
# paul-krugman             0.90      0.91      0.90       154
# roger-cohen              0.80      0.86      0.83       115
# ross-douthat             0.77      0.71      0.74        48
# thomas-l-friedman        0.87      0.87      0.87       116
#
# avg / total              0.84      0.83      0.83      1263

## print out top 10 most informative features
clf = pipeline.steps[1][1]
vect = pipeline.steps[0][1]
for i, class_label in enumerate(dataset.target_names):
    topt = numpy.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (class_label, ", ".join(vect.get_feature_names()[j] for j in topt)))

# charles-m-blow: republicans, said, some, our, most, those, were, obama, president, percent
# david-brooks: new, government, now, them, over, some, these, do, were, obama
# frank-bruni: last, many, just, re, them, had, him, said, her, she
# gail-collins: get, state, mr, do, her, were, she, said, new, had
# joe-nocera: do, them, years, other, said, she, new, were, its, had
# maureen-dowd: hillary, even, were, him, had, president, said, her, obama, she
# nicholas-kristof: my, also, some, because, our, year, said, had, her, she
# paul-krugman: government, were, health, much, obama, economic, economy, now, even, mr
# roger-cohen: had, states, united, american, now, israel, world, iran, obama, its
# ross-douthat: well, because, new, many, even, just, party, our, its, obama
# thomas-l-friedman: president, america, them, how, its, now, just, do, world, our
```
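For what it's worth, the same run can be expressed with the convenience helpers from the easypipe file elsewhere in this gist; a sketch, assuming the final version of those helpers (where `print_metrics` takes a data folder) is importable:

```python
# Sketch only: assumes easypipe.py (final revision above) is on the import path
from easypipe import easy_pipeline, print_metrics

# same thresholds as the script above: CountVectorizer(min_df=0.05, max_df=0.75) + MultinomialNB
p = easy_pipeline(vect='count', min_df=0.05, max_df=0.75)
print_metrics(p, "./data-hold/cleaned/")
```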
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 2 additions and 1 deletion.
The lead-in sentence of the write-up's Results section was revised to read:

> Precision metrics and then most informative features...not super accurate...yet surprisingly accurate...:
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 68 additions and 1 deletion.
The section added to the write-up in this revision:

### Really naive NB classifier

Let's just do Naive Bayes and a plain old bag of words that includes only words used in at least 50% of the corpus:

```python
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline

data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle = False)
print("n_samples: %d" % len(dataset.data))

docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))
```

##### Results

Precision metrics and then most informative features:

```
   charles-m-blow       0.59      0.58      0.58        78
     david-brooks       0.78      0.61      0.68       199
      frank-bruni       0.71      0.63      0.67        75
     gail-collins       0.77      0.74      0.76       158
       joe-nocera       0.64      0.63      0.63        70
     maureen-dowd       0.57      0.74      0.65       121
 nicholas-kristof       0.84      0.75      0.79       115
     paul-krugman       0.76      0.81      0.78       153
      roger-cohen       0.60      0.73      0.66       112
     ross-douthat       0.71      0.59      0.64        61
thomas-l-friedman       0.69      0.77      0.73       121

      avg / total       0.71      0.70      0.70      1263

charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the
```
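As a side note on why the surviving features are almost all stopwords: when `min_df` is a float, CountVectorizer drops every term that appears in fewer than that fraction of documents. A tiny illustrative sketch (toy sentences, not from the corpus):

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "the senator said the vote failed",
    "the economy is the main issue",
    "she said the bill would pass",
    "markets rallied on the news",
]
# min_df=0.5 keeps only terms that appear in at least half of the documents
vect = CountVectorizer(min_df=0.5)
vect.fit(docs)
print(vect.get_feature_names())  # ['said', 'the'] -- only the most common words survive
# (in newer scikit-learn versions this method is get_feature_names_out())
```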
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 32 additions and 0 deletions.
The section added to the write-up in this revision:

#### Finding the top 20 features

```python
import numpy as np
clf = pipeline.steps[1][1]
vect = pipeline.steps[0][1]
feature_names = vect.get_feature_names()
class_labels = dataset.target_names

for i, class_label in enumerate(class_labels):
    topt = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label, " ".join(feature_names[j] for j in topt)))
```

Results:

```
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
```
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 24 additions and 0 deletions.
The new file added in this revision (a sample column text from the corpus):

WASHINGTON It’s a lost art, slinking away. Now the fashion is slinking back. Nobody wants to simply admit they made a mistake and disappear for awhile. Nobody even wants to use the weasel words: “Mistakes were made.” No, far better to pop right back up and get in the face of those who were savoring your absence. We should think of a name for this appalling modern phenomenon. Kissingering, perhaps.

In Las Vegas, there’s the loathsome O.J., a proper candidate for shunning and stun-gunning, barging back into the picture. And on Capitol Hill, Larry Craig shocked mortified Republicans by bounding into their weekly lunch. You’d think the conservative 62-year-old Idaho senator would have some shame, going from fervently opposing gay rights to provocatively tapping his toe in a Minneapolis airport toilet. (The toilet stall, now known as the Larry Craig bathroom, has become a hot local tourist attraction.) But no. As though Republicans don’t have enough problems, Mr. Craig said he is ready to go back to work while the legal hotshots he hired appeal his case. He even cast a couple votes, one against D.C. voting rights. (This creep gets to decide about my representation?)

Even if President Bush is “the cockiest guy” around, as the former Mexican President Vicente Fox writes in a new memoir critical of W.’s “grade-school-level” Spanish and his grade-school-level Iraq policy, he can’t be feeling good about the barbs being hurled his way by former supporters and enablers.

Rummy’s back in the news, giving interviews about a planned memoir and foundation designed to encourage “reasoned and civil debate” about global challenges and to spur more young people to go into government. It’s rich. Maybe more young people would go into government if they didn’t have to work for devious bullies like Rummy who make huge life-and-death mistakes and then don’t apologize. In The Washington Post, he blamed the press and Congress for creating an inhospitable atmosphere that drives good people away from public service. Maybe that’s why he and his evil twin, Dick Cheney, did their best to undermine the constitutional system of checks and balances so they could get more fine young people to serve.

Does the man blamed for creating civil disorder in Iraq even know what the word “civil” means? Wasn’t he the prickly Pentagon chief who got furious with anyone who didn’t agree with him on “global challenges”? He shoved Gen. Eric Shinseki into retirement — and failed to show up at his retirement party — after the good general correctly told Congress that it would take several hundred thousand troops to invade and control Iraq. And he snubbed the German defense minister when Germany joined the Coalition of the Unwilling.

Interviewed by GQ’s Lisa DePaulo on his ranch in Taos, N.M., with another mule named Gus nearby, the “75-year-old package of waning testosterone,” as the writer called him, was asked if he misses W. Offering a wry smile, he replied, “Um, no.” He now treats the son with the same contempt he treated the father with, which is why it’s so odd that the son hired his dad’s nemesis in the first place.

He actually had the gall to imply to Ms. DePaulo that he was out of the loop on Iraq and dragged out a copy of a memo he had written outlining all the things that could go wrong. In fact, he was the one, right after 9/11, who began pushing to go after Saddam. He and Cheney were orchestrating the invasion from the start, guiding the dauphin with warnings about how weak he would seem if he let Saddam mock him. The ultimate bureaucratic infighter wrote the memo as part of his Socratic strategy, asking a lot of questions when he was already pushing to go into Iraq. He never did any contingency planning in case those things went wrong; the memo was there simply so that someday he could pull it out for a reporter.

In the same issue of GQ, Colin Powell tried to build up the objections he made to the president, too, in an interview with Walter Isaacson. But nobody’s buying.

Even though he rubber-stamped W.’s tax cuts, Alan Greenspan is now upbraiding the president and vice president for profligate spending and putting politics ahead of sound economics. He also says in his new memoir that “the Iraq war is largely about oil,” telling Bob Woodward that he had privately told W. and Cheney that ousting Saddam was “essential” to keeping world oil supplies safe. Irrational exuberance, indeed.
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 1 addition and 1 deletion.
The intro sentence of the write-up was revised to read:

> The use of TF-IDF and LinearSVC is copied [verbatim from the scikit-learn text analysis tutorial](https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py) on about 5,000 columns gathered across 11 NYT columnists, for example, Maureen Dowd columns as listed on [/column/maureen-dowd](http://www.nytimes.com/column/maureen-dowd).
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 8 additions and 0 deletions.
The top of the write-up's code block after this revision (the import statements were added):

```python
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle = True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)
```
dannguyen created this gist
Mar 9, 2015.
The original write-up:

### Machine learning fun with scikit-learn and NYT columnists

The use of TF-IDF and LinearSVC is copied [verbatim from the scikit-learn text analysis tutorial](https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py) on about 5,000 columns gathered across 11 NYT columnists (for example, from [/column/maureen-dowd](http://www.nytimes.com/column/maureen-dowd)).

```python
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle = True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)

sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)

# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted, target_names = sh_dataset.target_names))
```

Initial results:

```
                   precision    recall  f1-score   support

   charles-m-blow       0.99      0.94      0.96        81
     david-brooks       0.98      0.98      0.98       169
      frank-bruni       1.00      0.98      0.99        64
     gail-collins       0.99      0.98      0.98       167
       joe-nocera       0.95      0.95      0.95        76
     maureen-dowd       0.95      0.98      0.96       125
 nicholas-kristof       0.93      0.96      0.95       134
     paul-krugman       0.98      0.99      0.98       157
      roger-cohen       0.99      0.99      0.99       115
     ross-douthat       1.00      0.94      0.97        49
thomas-l-friedman       0.98      0.98      0.98       126

      avg / total       0.97      0.97      0.97      1263
```
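Once fitted, the pipeline can be pointed at raw, unseen text directly, since the vectorizer is its first step; a minimal sketch (the document below is invented for illustration):

```python
# Hypothetical new document, not from the training corpus
new_docs = ["WASHINGTON -- Congress returned this week to another round of budget brinkmanship..."]

predicted = sh_pipeline.predict(new_docs)
print([sh_dataset.target_names[i] for i in predicted])  # prints the predicted columnist label
```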