Last active
August 29, 2015 14:07
-
-
Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.
Revisions
-
chyikwei revised this gist
Oct 14, 2014. 3 changed files with 0 additions and 0 deletions. There are no files selected for viewing.
File renamed without changes. File renamed without changes. File renamed without changes. -
chyikwei revised this gist
Oct 14, 2014. 1 changed file with 74 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,74 @@ File: lda.py Function: _dirichlet_expectation at line 26 Total time: 3.92028 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 26 @profile 27 def _dirichlet_expectation(alpha): 28 """ 29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. 30 """ 31 379947 391707 1.0 10.0 if (len(alpha.shape) == 1): 32 379940 3197582 8.4 81.6 ret = _dirichlet_expectation_1d(alpha) 33 else: 34 7 14893 2127.6 0.4 ret = _dirichlet_expectation_2d(alpha) 35 379947 316096 0.8 8.1 return ret File: lda.py Function: _update_gamma at line 38 Total time: 21.0102 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 38 @profile 39 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, 40 meanchangethresh, cal_delta): 41 """ 42 E-step: update latent variable gamma 43 """ 44 45 2 8 4.0 0.0 n_docs, n_vocabs = X.shape 46 2 4 2.0 0.0 n_topics = expElogbeta.shape[0] 47 48 # gamma is non-normailzed topic distribution 49 2 4959 2479.5 0.0 gamma = rng.gamma(100., 1. 
/ 100., (n_docs, n_topics)) 50 2 5692 2846.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) 51 # diff on component (only calculate it when keep_comp_change is True) 52 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None 53 54 2 4 2.0 0.0 X_data = X.data 55 2 3 1.5 0.0 X_indices = X.indices 56 2 2 1.0 0.0 X_indptr = X.indptr 57 58 8002 12836 1.6 0.1 for d in xrange(n_docs): 59 8000 26909 3.4 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] 60 8000 21216 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] 61 8000 36913 4.6 0.2 gammad = gamma[d, :] 62 8000 28754 3.6 0.1 expElogthetad = expElogtheta[d, :] 63 8000 106489 13.3 0.5 expElogbetad = expElogbeta[:, ids] 64 # The optimal phi_{dwk} is proportional to 65 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. 66 8000 80909 10.1 0.4 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 67 68 # Iterate between gamma and phi until convergence 69 381325 466912 1.2 2.2 for it in xrange(0, max_iters): 70 379940 785902 2.1 3.7 lastgamma = gammad 71 # We represent phi implicitly to save memory and time. 72 # Substituting the value of the optimal phi back into 73 # the update for gamma gives this update. Cf. Lee&Seung 2001. 74 379940 455282 1.2 2.2 gammad = alpha + expElogthetad * \ 75 379940 5387921 14.2 25.6 np.dot(cnts / phinorm, expElogbetad.T) 76 379940 7855665 20.7 37.4 expElogthetad = np.exp(_dirichlet_expectation(gammad)) 77 379940 3367420 8.9 16.0 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 78 79 379940 1301467 3.4 6.2 meanchange = mean_change(lastgamma, gammad) 80 379940 542443 1.4 2.6 if (meanchange < meanchangethresh): 81 6615 8323 1.3 0.0 break 82 8000 50913 6.4 0.2 gamma[d, :] = gammad 83 # Contribution of document d to the expected sufficient 84 # statistics for the M step. 85 8000 10525 1.3 0.1 if cal_delta: 86 8000 452723 56.6 2.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) 87 88 2 3 1.5 0.0 return (gamma, delta_component) -
chyikwei revised this gist
Oct 14, 2014. 1 changed file with 72 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,72 @@ File: lda.py Function: _dirichlet_expectation at line 26 Total time: 8.93651 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 26 @profile 27 def _dirichlet_expectation(alpha): 28 """ 29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. 30 """ 31 379947 391754 1.0 4.4 if (len(alpha.shape) == 1): 32 379940 8532031 22.5 95.5 return(psi(alpha) - psi(np.sum(alpha))) 33 7 12729 1818.4 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]) File: lda.py Function: _update_gamma at line 35 Total time: 25.8925 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 35 @profile 36 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, 37 meanchangethresh, cal_delta): 38 """ 39 E-step: update latent variable gamma 40 """ 41 42 2 8 4.0 0.0 n_docs, n_vocabs = X.shape 43 2 5 2.5 0.0 n_topics = expElogbeta.shape[0] 44 45 # gamma is non-normailzed topic distribution 46 2 4931 2465.5 0.0 gamma = rng.gamma(100., 1. 
/ 100., (n_docs, n_topics)) 47 2 5778 2889.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) 48 # diff on component (only calculate it when keep_comp_change is True) 49 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None 50 51 2 4 2.0 0.0 X_data = X.data 52 2 3 1.5 0.0 X_indices = X.indices 53 2 2 1.0 0.0 X_indptr = X.indptr 54 55 8002 12479 1.6 0.0 for d in xrange(n_docs): 56 8000 26147 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] 57 8000 21494 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] 58 8000 37107 4.6 0.1 gammad = gamma[d, :] 59 8000 29111 3.6 0.1 expElogthetad = expElogtheta[d, :] 60 8000 99660 12.5 0.4 expElogbetad = expElogbeta[:, ids] 61 # The optimal phi_{dwk} is proportional to 62 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. 63 8000 79255 9.9 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 64 65 # Iterate between gamma and phi until convergence 66 381325 473084 1.2 1.8 for it in xrange(0, max_iters): 67 379940 771424 2.0 3.0 lastgamma = gammad 68 # We represent phi implicitly to save memory and time. 69 # Substituting the value of the optimal phi back into 70 # the update for gamma gives this update. Cf. Lee&Seung 2001. 71 379940 453302 1.2 1.8 gammad = alpha + expElogthetad * \ 72 379940 5402250 14.2 20.9 np.dot(cnts / phinorm, expElogbetad.T) 73 379940 12609292 33.2 48.7 expElogthetad = np.exp(_dirichlet_expectation(gammad)) 74 379940 3407782 9.0 13.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 75 76 379940 1417461 3.7 5.5 meanchange = mean_change(lastgamma, gammad) 77 379940 532688 1.4 2.1 if (meanchange < meanchangethresh): 78 6615 8396 1.3 0.0 break 79 8000 50124 6.3 0.2 gamma[d, :] = gammad 80 # Contribution of document d to the expected sufficient 81 # statistics for the M step. 82 8000 10207 1.3 0.0 if cal_delta: 83 8000 440468 55.1 1.7 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) 84 85 2 3 1.5 0.0 return (gamma, delta_component) -
chyikwei created this gist
Oct 14, 2014. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,72 @@ File: lda.py Function: _dirichlet_expectation at line 24 Total time: 8.96912 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 24 @profile 25 def _dirichlet_expectation(alpha): 26 """ 27 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. 28 """ 29 379947 411076 1.1 4.6 if (len(alpha.shape) == 1): 30 379940 8545062 22.5 95.3 return(psi(alpha) - psi(np.sum(alpha))) 31 7 12980 1854.3 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]) File: lda.py Function: _update_gamma at line 33 Total time: 37.1273 s Line # Hits Time Per Hit % Time Line Contents ============================================================== 33 @profile 34 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, 35 meanchangethresh, cal_delta): 36 """ 37 E-step: update latent variable gamma 38 """ 39 40 2 8 4.0 0.0 n_docs, n_vocabs = X.shape 41 2 4 2.0 0.0 n_topics = expElogbeta.shape[0] 42 43 # gamma is non-normailzed topic distribution 44 2 5032 2516.0 0.0 gamma = rng.gamma(100., 1. 
/ 100., (n_docs, n_topics)) 45 2 5883 2941.5 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) 46 # diff on component (only calculate it when keep_comp_change is True) 47 2 70 35.0 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None 48 49 2 3 1.5 0.0 X_data = X.data 50 2 2 1.0 0.0 X_indices = X.indices 51 2 2 1.0 0.0 X_indptr = X.indptr 52 53 8002 12721 1.6 0.0 for d in xrange(n_docs): 54 8000 25173 3.1 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] 55 8000 19870 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] 56 8000 30900 3.9 0.1 gammad = gamma[d, :] 57 8000 26641 3.3 0.1 expElogthetad = expElogtheta[d, :] 58 8000 104626 13.1 0.3 expElogbetad = expElogbeta[:, ids] 59 # The optimal phi_{dwk} is proportional to 60 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. 61 8000 79777 10.0 0.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 62 63 # Iterate between gamma and phi until convergence 64 381325 467124 1.2 1.3 for it in xrange(0, max_iters): 65 379940 565605 1.5 1.5 lastgamma = gammad 66 # We represent phi implicitly to save memory and time. 67 # Substituting the value of the optimal phi back into 68 # the update for gamma gives this update. Cf. Lee&Seung 2001. 69 379940 428819 1.1 1.2 gammad = alpha + expElogthetad * \ 70 379940 5605904 14.8 15.1 np.dot(cnts / phinorm, expElogbetad.T) 71 379940 12712990 33.5 34.2 expElogthetad = np.exp(_dirichlet_expectation(gammad)) 72 379940 3375137 8.9 9.1 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 73 74 379940 12524287 33.0 33.7 meanchange = np.mean(abs(gammad - lastgamma)) 75 379940 620657 1.6 1.7 if (meanchange < meanchangethresh): 76 6615 8065 1.2 0.0 break 77 8000 50140 6.3 0.1 gamma[d, :] = gammad 78 # Contribution of document d to the expected sufficient 79 # statistics for the M step. 
80 8000 9904 1.2 0.0 if cal_delta: 81 8000 447906 56.0 1.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) 82 83 2 3 1.5 0.0 return (gamma, delta_component)