ululh · February 1, 2023 09:32 · Jun 25, 2017 · Jun 25, 2017 · Jun 25, 2017 · Jun 25, 2017
diff --git a/LDApredict.py b/LDApredict.py
@@ -1,5 +1,5 @@
 # derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
-# explanations are located there :
+# explanations are located there : https://www.linkedin.com/pulse/dissociating-training-predicting-latent-dirichlet-lucien-tardres
 
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation

diff --git a/LDApredict.py b/LDApredict.py
@@ -12,7 +12,7 @@
 with open ('outfile', 'rb') as fd:
     (features,lda.components_,lda.exp_dirichlet_component_,lda.doc_topic_prior_) = pickle.load(fd)
 
-# the dataset to predict on (first two sample were also in the training set so you can compare)
+# the dataset to predict on (first two samples were also in the training set so one can compare)
 data_samples = ["I like to eat broccoli and bananas.",
                 "I ate a banana and spinach smoothie for breakfast.",
                 "kittens and dogs are boring"

diff --git a/gistfile1.txt → LDApredict.py b/gistfile1.txt → LDApredict.py
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,26 @@
+# derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
+# explanations are located there :
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+import pickle
+
+# create a blank model
+lda = LatentDirichletAllocation()
+
+# load parameters from file
+with open ('outfile', 'rb') as fd:
+    (features,lda.components_,lda.exp_dirichlet_component_,lda.doc_topic_prior_) = pickle.load(fd)
+
+# the dataset to predict on (first two sample were also in the training set so you can compare)
+data_samples = ["I like to eat broccoli and bananas.",
+                "I ate a banana and spinach smoothie for breakfast.",
+                "kittens and dogs are boring"
+               ]
+# Vectorize the samples to predict on, using the model's features as the vocabulary
+tf_vectorizer = CountVectorizer(vocabulary=features)
+tf = tf_vectorizer.fit_transform(data_samples)
+
+# transform() returns a matrix with one row per document; each column holds a topic's weight
+predict = lda.transform(tf)
+print(predict)