chyikwei · August 29, 2015 14:07 · Oct 14, 2014 · Oct 14, 2014 · Oct 14, 2014 · Oct 14, 2014
diff --git a/original → 01_original b/original → 01_original
diff --git a/add mean_change to cython → 02_add mean_change to cython b/add mean_change to cython → 02_add mean_change to cython
diff --git a/update _dirichlet_expectation → 03_update _dirichlet_expectation b/update _dirichlet_expectation → 03_update _dirichlet_expectation
diff --git a/update _dirichlet_expectation b/update _dirichlet_expectation
@@ -0,0 +1,74 @@
+File: lda.py
+Function: _dirichlet_expectation at line 26
+Total time: 3.92028 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    26                                           @profile
+    27                                           def _dirichlet_expectation(alpha):
+    28                                               """
+    29                                               For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
+    30                                               """
+    31    379947       391707      1.0     10.0      if (len(alpha.shape) == 1):
+    32    379940      3197582      8.4     81.6          ret = _dirichlet_expectation_1d(alpha)
+    33                                               else:
+    34         7        14893   2127.6      0.4          ret = _dirichlet_expectation_2d(alpha)
+    35    379947       316096      0.8      8.1      return ret
+
+File: lda.py
+Function: _update_gamma at line 38
+Total time: 21.0102 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    38                                           @profile
+    39                                           def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
+    40                                                             meanchangethresh, cal_delta):
+    41                                               """
+    42                                               E-step: update latent variable gamma
+    43                                               """
+    44
+    45         2            8      4.0      0.0      n_docs, n_vocabs = X.shape
+    46         2            4      2.0      0.0      n_topics = expElogbeta.shape[0]
+    47
+    48                                               # gamma is non-normalized topic distribution
+    49         2         4959   2479.5      0.0      gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
+    50         2         5692   2846.0      0.0      expElogtheta = np.exp(_dirichlet_expectation(gamma))
+    51                                               # diff on component (only calculate it when keep_comp_change is True)
+    52         2           23     11.5      0.0      delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
+    53
+    54         2            4      2.0      0.0      X_data = X.data
+    55         2            3      1.5      0.0      X_indices = X.indices
+    56         2            2      1.0      0.0      X_indptr = X.indptr
+    57
+    58      8002        12836      1.6      0.1      for d in xrange(n_docs):
+    59      8000        26909      3.4      0.1          ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
+    60      8000        21216      2.7      0.1          cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
+    61      8000        36913      4.6      0.2          gammad = gamma[d, :]
+    62      8000        28754      3.6      0.1          expElogthetad = expElogtheta[d, :]
+    63      8000       106489     13.3      0.5          expElogbetad = expElogbeta[:, ids]
+    64                                                   # The optimal phi_{dwk} is proportional to
+    65                                                   # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
+    66      8000        80909     10.1      0.4          phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    67
+    68                                                   # Iterate between gamma and phi until convergence
+    69    381325       466912      1.2      2.2          for it in xrange(0, max_iters):
+    70    379940       785902      2.1      3.7              lastgamma = gammad
+    71                                                       # We represent phi implicitly to save memory and time.
+    72                                                       # Substituting the value of the optimal phi back into
+    73                                                       # the update for gamma gives this update. Cf. Lee&Seung 2001.
+    74    379940       455282      1.2      2.2              gammad = alpha + expElogthetad * \
+    75    379940      5387921     14.2     25.6                  np.dot(cnts / phinorm, expElogbetad.T)
+    76    379940      7855665     20.7     37.4              expElogthetad = np.exp(_dirichlet_expectation(gammad))
+    77    379940      3367420      8.9     16.0              phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    78
+    79    379940      1301467      3.4      6.2              meanchange = mean_change(lastgamma, gammad)
+    80    379940       542443      1.4      2.6              if (meanchange < meanchangethresh):
+    81      6615         8323      1.3      0.0                  break
+    82      8000        50913      6.4      0.2          gamma[d, :] = gammad
+    83                                                   # Contribution of document d to the expected sufficient
+    84                                                   # statistics for the M step.
+    85      8000        10525      1.3      0.1          if cal_delta:
+    86      8000       452723     56.6      2.2              delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
+    87
+    88         2            3      1.5      0.0      return (gamma, delta_component)
diff --git a/add mean_change to cython b/add mean_change to cython
@@ -0,0 +1,72 @@
+File: lda.py
+Function: _dirichlet_expectation at line 26
+Total time: 8.93651 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    26                                           @profile
+    27                                           def _dirichlet_expectation(alpha):
+    28                                               """
+    29                                               For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
+    30                                               """
+    31    379947       391754      1.0      4.4      if (len(alpha.shape) == 1):
+    32    379940      8532031     22.5     95.5          return(psi(alpha) - psi(np.sum(alpha)))
+    33         7        12729   1818.4      0.1      return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])
+
+File: lda.py
+Function: _update_gamma at line 35
+Total time: 25.8925 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    35                                           @profile
+    36                                           def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
+    37                                                             meanchangethresh, cal_delta):
+    38                                               """
+    39                                               E-step: update latent variable gamma
+    40                                               """
+    41
+    42         2            8      4.0      0.0      n_docs, n_vocabs = X.shape
+    43         2            5      2.5      0.0      n_topics = expElogbeta.shape[0]
+    44
+    45                                               # gamma is non-normalized topic distribution
+    46         2         4931   2465.5      0.0      gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
+    47         2         5778   2889.0      0.0      expElogtheta = np.exp(_dirichlet_expectation(gamma))
+    48                                               # diff on component (only calculate it when keep_comp_change is True)
+    49         2           23     11.5      0.0      delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
+    50
+    51         2            4      2.0      0.0      X_data = X.data
+    52         2            3      1.5      0.0      X_indices = X.indices
+    53         2            2      1.0      0.0      X_indptr = X.indptr
+    54
+    55      8002        12479      1.6      0.0      for d in xrange(n_docs):
+    56      8000        26147      3.3      0.1          ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
+    57      8000        21494      2.7      0.1          cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
+    58      8000        37107      4.6      0.1          gammad = gamma[d, :]
+    59      8000        29111      3.6      0.1          expElogthetad = expElogtheta[d, :]
+    60      8000        99660     12.5      0.4          expElogbetad = expElogbeta[:, ids]
+    61                                                   # The optimal phi_{dwk} is proportional to
+    62                                                   # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
+    63      8000        79255      9.9      0.3          phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    64
+    65                                                   # Iterate between gamma and phi until convergence
+    66    381325       473084      1.2      1.8          for it in xrange(0, max_iters):
+    67    379940       771424      2.0      3.0              lastgamma = gammad
+    68                                                       # We represent phi implicitly to save memory and time.
+    69                                                       # Substituting the value of the optimal phi back into
+    70                                                       # the update for gamma gives this update. Cf. Lee&Seung 2001.
+    71    379940       453302      1.2      1.8              gammad = alpha + expElogthetad * \
+    72    379940      5402250     14.2     20.9                  np.dot(cnts / phinorm, expElogbetad.T)
+    73    379940     12609292     33.2     48.7              expElogthetad = np.exp(_dirichlet_expectation(gammad))
+    74    379940      3407782      9.0     13.2              phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    75
+    76    379940      1417461      3.7      5.5              meanchange = mean_change(lastgamma, gammad)
+    77    379940       532688      1.4      2.1              if (meanchange < meanchangethresh):
+    78      6615         8396      1.3      0.0                  break
+    79      8000        50124      6.3      0.2          gamma[d, :] = gammad
+    80                                                   # Contribution of document d to the expected sufficient
+    81                                                   # statistics for the M step.
+    82      8000        10207      1.3      0.0          if cal_delta:
+    83      8000       440468     55.1      1.7              delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
+    84
+    85         2            3      1.5      0.0      return (gamma, delta_component)
diff --git a/original b/original
@@ -0,0 +1,72 @@
+File: lda.py
+Function: _dirichlet_expectation at line 24
+Total time: 8.96912 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    24                                           @profile
+    25                                           def _dirichlet_expectation(alpha):
+    26                                               """
+    27                                               For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
+    28                                               """
+    29    379947       411076      1.1      4.6      if (len(alpha.shape) == 1):
+    30    379940      8545062     22.5     95.3          return(psi(alpha) - psi(np.sum(alpha)))
+    31         7        12980   1854.3      0.1      return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])
+
+File: lda.py
+Function: _update_gamma at line 33
+Total time: 37.1273 s
+
+Line #      Hits         Time  Per Hit   % Time  Line Contents
+==============================================================
+    33                                           @profile
+    34                                           def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
+    35                                                             meanchangethresh, cal_delta):
+    36                                               """
+    37                                               E-step: update latent variable gamma
+    38                                               """
+    39
+    40         2            8      4.0      0.0      n_docs, n_vocabs = X.shape
+    41         2            4      2.0      0.0      n_topics = expElogbeta.shape[0]
+    42
+    43                                               # gamma is non-normalized topic distribution
+    44         2         5032   2516.0      0.0      gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
+    45         2         5883   2941.5      0.0      expElogtheta = np.exp(_dirichlet_expectation(gamma))
+    46                                               # diff on component (only calculate it when keep_comp_change is True)
+    47         2           70     35.0      0.0      delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
+    48
+    49         2            3      1.5      0.0      X_data = X.data
+    50         2            2      1.0      0.0      X_indices = X.indices
+    51         2            2      1.0      0.0      X_indptr = X.indptr
+    52
+    53      8002        12721      1.6      0.0      for d in xrange(n_docs):
+    54      8000        25173      3.1      0.1          ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
+    55      8000        19870      2.5      0.1          cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
+    56      8000        30900      3.9      0.1          gammad = gamma[d, :]
+    57      8000        26641      3.3      0.1          expElogthetad = expElogtheta[d, :]
+    58      8000       104626     13.1      0.3          expElogbetad = expElogbeta[:, ids]
+    59                                                   # The optimal phi_{dwk} is proportional to
+    60                                                   # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
+    61      8000        79777     10.0      0.2          phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    62
+    63                                                   # Iterate between gamma and phi until convergence
+    64    381325       467124      1.2      1.3          for it in xrange(0, max_iters):
+    65    379940       565605      1.5      1.5              lastgamma = gammad
+    66                                                       # We represent phi implicitly to save memory and time.
+    67                                                       # Substituting the value of the optimal phi back into
+    68                                                       # the update for gamma gives this update. Cf. Lee&Seung 2001.
+    69    379940       428819      1.1      1.2              gammad = alpha + expElogthetad * \
+    70    379940      5605904     14.8     15.1                  np.dot(cnts / phinorm, expElogbetad.T)
+    71    379940     12712990     33.5     34.2              expElogthetad = np.exp(_dirichlet_expectation(gammad))
+    72    379940      3375137      8.9      9.1              phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
+    73
+    74    379940     12524287     33.0     33.7              meanchange = np.mean(abs(gammad - lastgamma))
+    75    379940       620657      1.6      1.7              if (meanchange < meanchangethresh):
+    76      6615         8065      1.2      0.0                  break
+    77      8000        50140      6.3      0.1          gamma[d, :] = gammad
+    78                                                   # Contribution of document d to the expected sufficient
+    79                                                   # statistics for the M step.
+    80      8000         9904      1.2      0.0          if cal_delta:
+    81      8000       447906     56.0      1.2              delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
+    82
+    83         2            3      1.5      0.0      return (gamma, delta_component)