Last active: February 10, 2022 16:45
Revisions
dannguyen renamed this gist
Mar 9, 2015 · 1 changed file with 15 additions and 7 deletions.
The changed portion of the file after this revision:

```python
import sys
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy

## make a Multinomial-NB and CountVectorized pipeline by default
## or switch in tf-idf as your vectorizer, and LinearSVC for your classifier
def easy_pipeline(vect = 'tfidf', clf = 'mnb', min_df = 0.01, max_df = 0.95, stop_words = None, decode_error = 'ignore'):
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    if clf == 'lsvc':
        C = LinearSVC
    else:
        C = MultinomialNB
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df, stop_words = stop_words, decode_error = decode_error)),
        ('clf', C()),
    ])
    return pipeline
```
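A minimal usage sketch for this version of the helper; the folder path is an illustrative assumption (it echoes the usage docstring added in the next revision entry), not something fixed by the gist:

```python
# Illustrative usage of easy_pipeline() as defined above; the folder path is an assumption.
# load_files() expects one sub-directory of text files per label.
from sklearn.datasets import load_files

dataset = load_files("data-hold/20news", shuffle=False)

# tf-idf features with a LinearSVC classifier...
p = easy_pipeline(vect='tfidf', clf='lsvc')
p.fit(dataset.data, dataset.target)

# ...or the default CountVectorizer + MultinomialNB combination
p2 = easy_pipeline(vect='count', clf='mnb')
p2.fit(dataset.data, dataset.target)
```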
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 18 additions and 7 deletions.
The changed portions of the file after this revision:

```python
# some convenience functions here, nothing new
'''
# usage:
from easypipe import easy_pipeline
from easypipe import print_metrics
data_folder = "data-hold/20news"
p = easy_pipeline()
print_metrics(p, data_folder)
'''
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics

# ... (unchanged lines omitted in the diff) ...

## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(vect = 'tfidf', min_df = 0.01, max_df = 0.95, stop_words = None, decode_error = 'ignore'):
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df, stop_words = stop_words, decode_error = decode_error)),
        ('clf', MultinomialNB()),
    ])
    return pipeline

## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, data_folder, test_size = 0.25):
    dataset = load_files(data_folder, shuffle = False)
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size = test_size, random_state=None)
    pipeline.fit(docs_train, y_train)
    # ... (remainder of print_metrics unchanged in this diff) ...
```
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 43 additions and 0 deletions.
The new file added in this revision:

```python
# some convenience functions here, nothing new
# usage:
#
# data_folder = "stuff/"
# dataset = load_files(data_folder, shuffle = False)
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy

## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(min_df = 0.01, max_df = 0.95, stop_words = None):
    pipeline = Pipeline([
        ('vect', CountVectorizer(min_df=min_df, max_df=max_df, stop_words = stop_words)),
        ('clf', MultinomialNB()),
    ])
    return pipeline

## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, dataset, test_size = 0.25):
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size = test_size, random_state=None)
    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))

    ## print out top 10 words
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    for i, class_label in enumerate(dataset.target_names):
        topt = numpy.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label, ", ".join(vect.get_feature_names()[j] for j in topt)))
```
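A usage sketch that completes the comment at the top of the file; the "stuff/" folder name is just the placeholder from that comment, and `load_files` expects one sub-directory of text files per label:

```python
# Placeholder folder from the usage comment above; one sub-directory per label/author
data_folder = "stuff/"
dataset = load_files(data_folder, shuffle = False)

pipeline = easy_pipeline()        # CountVectorizer(min_df=0.01, max_df=0.95) + MultinomialNB
print_metrics(pipeline, dataset)  # per-label precision/recall/F1 plus the top-10 words per label
```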
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 68 additions and 0 deletions.
The new file added in this revision:

```python
# Using just CountVectorizer (threshold of 0.05 to 0.75) and Multinomial-Naive Bayes
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy

data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle = False)
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.05, max_df=0.75)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))

################ Performance report
#                     precision    recall  f1-score   support
# charles-m-blow           0.87      0.71      0.78        82
# david-brooks             0.93      0.75      0.83       187
# frank-bruni              0.69      0.83      0.76        60
# gail-collins             0.84      0.89      0.87       169
# joe-nocera               0.80      0.83      0.81        81
# maureen-dowd             0.76      0.88      0.81       128
# nicholas-kristof         0.84      0.83      0.83       123
# paul-krugman             0.90      0.91      0.90       154
# roger-cohen              0.80      0.86      0.83       115
# ross-douthat             0.77      0.71      0.74        48
# thomas-l-friedman        0.87      0.87      0.87       116
#
# avg / total              0.84      0.83      0.83      1263

## print out top 10 most informative features
clf = pipeline.steps[1][1]
vect = pipeline.steps[0][1]
for i, class_label in enumerate(dataset.target_names):
    topt = numpy.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (class_label, ", ".join(vect.get_feature_names()[j] for j in topt)))

# charles-m-blow: republicans, said, some, our, most, those, were, obama, president, percent
# david-brooks: new, government, now, them, over, some, these, do, were, obama
# frank-bruni: last, many, just, re, them, had, him, said, her, she
# gail-collins: get, state, mr, do, her, were, she, said, new, had
# joe-nocera: do, them, years, other, said, she, new, were, its, had
# maureen-dowd: hillary, even, were, him, had, president, said, her, obama, she
# nicholas-kristof: my, also, some, because, our, year, said, had, her, she
# paul-krugman: government, were, health, much, obama, economic, economy, now, even, mr
# roger-cohen: had, states, united, american, now, israel, world, iran, obama, its
# ross-douthat: well, because, new, many, even, just, party, our, its, obama
# thomas-l-friedman: president, america, them, how, its, now, just, do, world, our
```
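For what it's worth, the same run can be expressed with the convenience helpers from the easypipe file elsewhere in this gist; a sketch, assuming the final version of those helpers (where `print_metrics` takes a data folder) is importable:

```python
# Sketch only: assumes easypipe.py (final revision above) is on the import path
from easypipe import easy_pipeline, print_metrics

# same thresholds as the script above: CountVectorizer(min_df=0.05, max_df=0.75) + MultinomialNB
p = easy_pipeline(vect='count', min_df=0.05, max_df=0.75)
print_metrics(p, "./data-hold/cleaned/")
```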
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 2 additions and 1 deletion.
The lead-in sentence of the write-up's Results section was revised to read:

> Precision metrics and then most informative features...not super accurate...yet surprisingly accurate...:
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 68 additions and 1 deletion.
The section added to the write-up in this revision:

### Really naive NB classifier

Let's just do Naive Bayes and a plain old bag of words that includes only words used in at least 50% of the corpus:

```python
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline

data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle = False)
print("n_samples: %d" % len(dataset.data))

docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted, target_names = dataset.target_names))
```

##### Results

Precision metrics and then most informative features:

```
   charles-m-blow       0.59      0.58      0.58        78
     david-brooks       0.78      0.61      0.68       199
      frank-bruni       0.71      0.63      0.67        75
     gail-collins       0.77      0.74      0.76       158
       joe-nocera       0.64      0.63      0.63        70
     maureen-dowd       0.57      0.74      0.65       121
 nicholas-kristof       0.84      0.75      0.79       115
     paul-krugman       0.76      0.81      0.78       153
      roger-cohen       0.60      0.73      0.66       112
     ross-douthat       0.71      0.59      0.64        61
thomas-l-friedman       0.69      0.77      0.73       121

      avg / total       0.71      0.70      0.70      1263

charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the
```
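As a side note on why the surviving features are almost all stopwords: when `min_df` is a float, CountVectorizer drops every term that appears in fewer than that fraction of documents. A tiny illustrative sketch (toy sentences, not from the corpus):

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "the senator said the vote failed",
    "the economy is the main issue",
    "she said the bill would pass",
    "markets rallied on the news",
]
# min_df=0.5 keeps only terms that appear in at least half of the documents
vect = CountVectorizer(min_df=0.5)
vect.fit(docs)
print(vect.get_feature_names())  # ['said', 'the'] -- only the most common words survive
# (in newer scikit-learn versions this method is get_feature_names_out())
```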
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 32 additions and 0 deletions.
The section added to the write-up in this revision:

#### Finding the top 20 features

```python
import numpy as np
clf = pipeline.steps[1][1]
vect = pipeline.steps[0][1]
feature_names = vect.get_feature_names()
class_labels = dataset.target_names

for i, class_label in enumerate(class_labels):
    topt = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label, " ".join(feature_names[j] for j in topt)))
```

Results:

```
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
```
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 24 additions and 0 deletions.
The new file added in this revision (a sample column text from the corpus):

WASHINGTON It’s a lost art, slinking away. Now the fashion is slinking back. Nobody wants to simply admit they made a mistake and disappear for awhile. Nobody even wants to use the weasel words: “Mistakes were made.” No, far better to pop right back up and get in the face of those who were savoring your absence. We should think of a name for this appalling modern phenomenon. Kissingering, perhaps.

In Las Vegas, there’s the loathsome O.J., a proper candidate for shunning and stun-gunning, barging back into the picture. And on Capitol Hill, Larry Craig shocked mortified Republicans by bounding into their weekly lunch. You’d think the conservative 62-year-old Idaho senator would have some shame, going from fervently opposing gay rights to provocatively tapping his toe in a Minneapolis airport toilet. (The toilet stall, now known as the Larry Craig bathroom, has become a hot local tourist attraction.) But no. As though Republicans don’t have enough problems, Mr. Craig said he is ready to go back to work while the legal hotshots he hired appeal his case. He even cast a couple votes, one against D.C. voting rights. (This creep gets to decide about my representation?)

Even if President Bush is “the cockiest guy” around, as the former Mexican President Vicente Fox writes in a new memoir critical of W.’s “grade-school-level” Spanish and his grade-school-level Iraq policy, he can’t be feeling good about the barbs being hurled his way by former supporters and enablers.

Rummy’s back in the news, giving interviews about a planned memoir and foundation designed to encourage “reasoned and civil debate” about global challenges and to spur more young people to go into government. It’s rich. Maybe more young people would go into government if they didn’t have to work for devious bullies like Rummy who make huge life-and-death mistakes and then don’t apologize. In The Washington Post, he blamed the press and Congress for creating an inhospitable atmosphere that drives good people away from public service. Maybe that’s why he and his evil twin, Dick Cheney, did their best to undermine the constitutional system of checks and balances so they could get more fine young people to serve.

Does the man blamed for creating civil disorder in Iraq even know what the word “civil” means? Wasn’t he the prickly Pentagon chief who got furious with anyone who didn’t agree with him on “global challenges”? He shoved Gen. Eric Shinseki into retirement — and failed to show up at his retirement party — after the good general correctly told Congress that it would take several hundred thousand troops to invade and control Iraq. And he snubbed the German defense minister when Germany joined the Coalition of the Unwilling.

Interviewed by GQ’s Lisa DePaulo on his ranch in Taos, N.M., with another mule named Gus nearby, the “75-year-old package of waning testosterone,” as the writer called him, was asked if he misses W. Offering a wry smile, he replied, “Um, no.” He now treats the son with the same contempt he treated the father with, which is why it’s so odd that the son hired his dad’s nemesis in the first place.

He actually had the gall to imply to Ms. DePaulo that he was out of the loop on Iraq and dragged out a copy of a memo he had written outlining all the things that could go wrong. In fact, he was the one, right after 9/11, who began pushing to go after Saddam. He and Cheney were orchestrating the invasion from the start, guiding the dauphin with warnings about how weak he would seem if he let Saddam mock him. The ultimate bureaucratic infighter wrote the memo as part of his Socratic strategy, asking a lot of questions when he was already pushing to go into Iraq. He never did any contingency planning in case those things went wrong; the memo was there simply so that someday he could pull it out for a reporter.

In the same issue of GQ, Colin Powell tried to build up the objections he made to the president, too, in an interview with Walter Isaacson. But nobody’s buying.

Even though he rubber-stamped W.’s tax cuts, Alan Greenspan is now upbraiding the president and vice president for profligate spending and putting politics ahead of sound economics. He also says in his new memoir that “the Iraq war is largely about oil,” telling Bob Woodward that he had privately told W. and Cheney that ousting Saddam was “essential” to keeping world oil supplies safe. Irrational exuberance, indeed.
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 1 addition and 1 deletion.
The intro sentence of the write-up was revised to read:

> The use of TF-IDF and LinearSVC is copied [verbatim from the scikit-learn text analysis tutorial](https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py) on about 5,000 columns gathered across 11 NYT columnists, for example, Maureen Dowd columns as listed on [/column/maureen-dowd](http://www.nytimes.com/column/maureen-dowd).
dannguyen revised this gist
Mar 9, 2015 · 1 changed file with 8 additions and 0 deletions.
The top of the write-up's code block after this revision (the import statements were added):

```python
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle = True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)
```
dannguyen created this gist
Mar 9, 2015.
The original write-up:

### Machine learning fun with scikit-learn and NYT columnists

The use of TF-IDF and LinearSVC is copied [verbatim from the scikit-learn text analysis tutorial](https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py) on about 5,000 columns gathered across 11 NYT columnists (for example, from [/column/maureen-dowd](http://www.nytimes.com/column/maureen-dowd)).

```python
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle = True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)

sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)

# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted, target_names = sh_dataset.target_names))
```

Initial results:

```
                   precision    recall  f1-score   support

   charles-m-blow       0.99      0.94      0.96        81
     david-brooks       0.98      0.98      0.98       169
      frank-bruni       1.00      0.98      0.99        64
     gail-collins       0.99      0.98      0.98       167
       joe-nocera       0.95      0.95      0.95        76
     maureen-dowd       0.95      0.98      0.96       125
 nicholas-kristof       0.93      0.96      0.95       134
     paul-krugman       0.98      0.99      0.98       157
      roger-cohen       0.99      0.99      0.99       115
     ross-douthat       1.00      0.94      0.97        49
thomas-l-friedman       0.98      0.98      0.98       126

      avg / total       0.97      0.97      0.97      1263
```
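Once fitted, the pipeline can be pointed at raw, unseen text directly, since the vectorizer is its first step; a minimal sketch (the document below is invented for illustration):

```python
# Hypothetical new document, not from the training corpus
new_docs = ["WASHINGTON -- Congress returned this week to another round of budget brinkmanship..."]

predicted = sh_pipeline.predict(new_docs)
print([sh_dataset.target_names[i] for i in predicted])  # prints the predicted columnist label
```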