Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Last active August 29, 2015 14:07
Show Gist options
  • Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.
Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.

Revisions

  1. chyikwei revised this gist Oct 14, 2014. 3 changed files with 0 additions and 0 deletions.
    File renamed without changes.
    File renamed without changes.
  2. chyikwei revised this gist Oct 14, 2014. 1 changed file with 74 additions and 0 deletions.
    74 changes: 74 additions & 0 deletions update _dirichlet_expectation
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,74 @@
    File: lda.py
    Function: _dirichlet_expectation at line 26
    Total time: 3.92028 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    26 @profile
    27 def _dirichlet_expectation(alpha):
    28 """
    29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
    30 """
    31 379947 391707 1.0 10.0 if (len(alpha.shape) == 1):
    32 379940 3197582 8.4 81.6 ret = _dirichlet_expectation_1d(alpha)
    33 else:
    34 7 14893 2127.6 0.4 ret = _dirichlet_expectation_2d(alpha)
    35 379947 316096 0.8 8.1 return ret

    File: lda.py
    Function: _update_gamma at line 38
    Total time: 21.0102 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    38 @profile
    39 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
    40 meanchangethresh, cal_delta):
    41 """
    42 E-step: update latent variable gamma
    43 """
    44
    45 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
    46 2 4 2.0 0.0 n_topics = expElogbeta.shape[0]
    47
    48 # gamma is non-normalized topic distribution
    49 2 4959 2479.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
    50 2 5692 2846.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
    51 # diff on component (only calculated when cal_delta is set)
    52 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
    53
    54 2 4 2.0 0.0 X_data = X.data
    55 2 3 1.5 0.0 X_indices = X.indices
    56 2 2 1.0 0.0 X_indptr = X.indptr
    57
    58 8002 12836 1.6 0.1 for d in xrange(n_docs):
    59 8000 26909 3.4 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
    60 8000 21216 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
    61 8000 36913 4.6 0.2 gammad = gamma[d, :]
    62 8000 28754 3.6 0.1 expElogthetad = expElogtheta[d, :]
    63 8000 106489 13.3 0.5 expElogbetad = expElogbeta[:, ids]
    64 # The optimal phi_{dwk} is proportional to
    65 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
    66 8000 80909 10.1 0.4 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    67
    68 # Iterate between gamma and phi until convergence
    69 381325 466912 1.2 2.2 for it in xrange(0, max_iters):
    70 379940 785902 2.1 3.7 lastgamma = gammad
    71 # We represent phi implicitly to save memory and time.
    72 # Substituting the value of the optimal phi back into
    73 # the update for gamma gives this update. Cf. Lee&Seung 2001.
    74 379940 455282 1.2 2.2 gammad = alpha + expElogthetad * \
    75 379940 5387921 14.2 25.6 np.dot(cnts / phinorm, expElogbetad.T)
    76 379940 7855665 20.7 37.4 expElogthetad = np.exp(_dirichlet_expectation(gammad))
    77 379940 3367420 8.9 16.0 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    78
    79 379940 1301467 3.4 6.2 meanchange = mean_change(lastgamma, gammad)
    80 379940 542443 1.4 2.6 if (meanchange < meanchangethresh):
    81 6615 8323 1.3 0.0 break
    82 8000 50913 6.4 0.2 gamma[d, :] = gammad
    83 # Contribution of document d to the expected sufficient
    84 # statistics for the M step.
    85 8000 10525 1.3 0.1 if cal_delta:
    86 8000 452723 56.6 2.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
    87
    88 2 3 1.5 0.0 return (gamma, delta_component)
  3. chyikwei revised this gist Oct 14, 2014. 1 changed file with 72 additions and 0 deletions.
    72 changes: 72 additions & 0 deletions add mean_change to cython
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,72 @@
    File: lda.py
    Function: _dirichlet_expectation at line 26
    Total time: 8.93651 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    26 @profile
    27 def _dirichlet_expectation(alpha):
    28 """
    29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
    30 """
    31 379947 391754 1.0 4.4 if (len(alpha.shape) == 1):
    32 379940 8532031 22.5 95.5 return(psi(alpha) - psi(np.sum(alpha)))
    33 7 12729 1818.4 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])

    File: lda.py
    Function: _update_gamma at line 35
    Total time: 25.8925 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    35 @profile
    36 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
    37 meanchangethresh, cal_delta):
    38 """
    39 E-step: update latent variable gamma
    40 """
    41
    42 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
    43 2 5 2.5 0.0 n_topics = expElogbeta.shape[0]
    44
    45 # gamma is non-normalized topic distribution
    46 2 4931 2465.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
    47 2 5778 2889.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
    48 # diff on component (only calculated when cal_delta is set)
    49 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
    50
    51 2 4 2.0 0.0 X_data = X.data
    52 2 3 1.5 0.0 X_indices = X.indices
    53 2 2 1.0 0.0 X_indptr = X.indptr
    54
    55 8002 12479 1.6 0.0 for d in xrange(n_docs):
    56 8000 26147 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
    57 8000 21494 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
    58 8000 37107 4.6 0.1 gammad = gamma[d, :]
    59 8000 29111 3.6 0.1 expElogthetad = expElogtheta[d, :]
    60 8000 99660 12.5 0.4 expElogbetad = expElogbeta[:, ids]
    61 # The optimal phi_{dwk} is proportional to
    62 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
    63 8000 79255 9.9 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    64
    65 # Iterate between gamma and phi until convergence
    66 381325 473084 1.2 1.8 for it in xrange(0, max_iters):
    67 379940 771424 2.0 3.0 lastgamma = gammad
    68 # We represent phi implicitly to save memory and time.
    69 # Substituting the value of the optimal phi back into
    70 # the update for gamma gives this update. Cf. Lee&Seung 2001.
    71 379940 453302 1.2 1.8 gammad = alpha + expElogthetad * \
    72 379940 5402250 14.2 20.9 np.dot(cnts / phinorm, expElogbetad.T)
    73 379940 12609292 33.2 48.7 expElogthetad = np.exp(_dirichlet_expectation(gammad))
    74 379940 3407782 9.0 13.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    75
    76 379940 1417461 3.7 5.5 meanchange = mean_change(lastgamma, gammad)
    77 379940 532688 1.4 2.1 if (meanchange < meanchangethresh):
    78 6615 8396 1.3 0.0 break
    79 8000 50124 6.3 0.2 gamma[d, :] = gammad
    80 # Contribution of document d to the expected sufficient
    81 # statistics for the M step.
    82 8000 10207 1.3 0.0 if cal_delta:
    83 8000 440468 55.1 1.7 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
    84
    85 2 3 1.5 0.0 return (gamma, delta_component)
  4. chyikwei created this gist Oct 14, 2014.
    72 changes: 72 additions & 0 deletions original
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,72 @@
    File: lda.py
    Function: _dirichlet_expectation at line 24
    Total time: 8.96912 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    24 @profile
    25 def _dirichlet_expectation(alpha):
    26 """
    27 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
    28 """
    29 379947 411076 1.1 4.6 if (len(alpha.shape) == 1):
    30 379940 8545062 22.5 95.3 return(psi(alpha) - psi(np.sum(alpha)))
    31 7 12980 1854.3 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])

    File: lda.py
    Function: _update_gamma at line 33
    Total time: 37.1273 s

    Line # Hits Time Per Hit % Time Line Contents
    ==============================================================
    33 @profile
    34 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
    35 meanchangethresh, cal_delta):
    36 """
    37 E-step: update latent variable gamma
    38 """
    39
    40 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
    41 2 4 2.0 0.0 n_topics = expElogbeta.shape[0]
    42
    43 # gamma is non-normalized topic distribution
    44 2 5032 2516.0 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
    45 2 5883 2941.5 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
    46 # diff on component (only calculated when cal_delta is set)
    47 2 70 35.0 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
    48
    49 2 3 1.5 0.0 X_data = X.data
    50 2 2 1.0 0.0 X_indices = X.indices
    51 2 2 1.0 0.0 X_indptr = X.indptr
    52
    53 8002 12721 1.6 0.0 for d in xrange(n_docs):
    54 8000 25173 3.1 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
    55 8000 19870 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
    56 8000 30900 3.9 0.1 gammad = gamma[d, :]
    57 8000 26641 3.3 0.1 expElogthetad = expElogtheta[d, :]
    58 8000 104626 13.1 0.3 expElogbetad = expElogbeta[:, ids]
    59 # The optimal phi_{dwk} is proportional to
    60 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
    61 8000 79777 10.0 0.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    62
    63 # Iterate between gamma and phi until convergence
    64 381325 467124 1.2 1.3 for it in xrange(0, max_iters):
    65 379940 565605 1.5 1.5 lastgamma = gammad
    66 # We represent phi implicitly to save memory and time.
    67 # Substituting the value of the optimal phi back into
    68 # the update for gamma gives this update. Cf. Lee&Seung 2001.
    69 379940 428819 1.1 1.2 gammad = alpha + expElogthetad * \
    70 379940 5605904 14.8 15.1 np.dot(cnts / phinorm, expElogbetad.T)
    71 379940 12712990 33.5 34.2 expElogthetad = np.exp(_dirichlet_expectation(gammad))
    72 379940 3375137 8.9 9.1 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
    73
    74 379940 12524287 33.0 33.7 meanchange = np.mean(abs(gammad - lastgamma))
    75 379940 620657 1.6 1.7 if (meanchange < meanchangethresh):
    76 6615 8065 1.2 0.0 break
    77 8000 50140 6.3 0.1 gamma[d, :] = gammad
    78 # Contribution of document d to the expected sufficient
    79 # statistics for the M step.
    80 8000 9904 1.2 0.0 if cal_delta:
    81 8000 447906 56.0 1.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
    82
    83 2 3 1.5 0.0 return (gamma, delta_component)