- 实词:名词、动词、形容词、状态词、区别词、数词、量词、代词
- 虚词:副词、介词、连词、助词、拟声词、叹词。
n 名词
nr 人名
| import numpy as np | |
| import marisa_trie | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.externals import six | |
| class MarisaCountVectorizer(CountVectorizer): | |
| # ``CountVectorizer.fit`` method calls ``fit_transform`` so | |
| # ``fit`` is not provided | |
| def fit_transform(self, raw_documents, y=None): |
| # Your init script | |
| # | |
| # Atom will evaluate this file each time a new window is opened. It is run | |
| # after packages are loaded/activated and after the previous editor state | |
| # has been restored. | |
| # | |
| # An example hack to log to the console when each text editor is saved. | |
| # | |
| # atom.workspace.observeTextEditors (editor) -> | |
| # editor.onDidSave -> |
| import tensorflow as tf | |
| import numpy as np | |
| if __name__ == '__main__': | |
| np.random.seed(1) | |
| # the size of the hidden state for the lstm (notice the lstm uses 2x of this amount so actually lstm will have state of size 2) | |
| size = 1 | |
| # 2 different sequences total | |
| batch_size= 2 | |
| # the maximum steps for both sequences is 10 |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction import DictVectorizer | |
| def encode_onehot(df, cols): | |
| """ | |
| One-hot encoding is applied to columns specified in a pandas DataFrame. | |
| Modified from: https://gist.github.com/kljensen/5452382 | |
| from pyspark import SparkContext | |
| import numpy as np | |
| from sklearn.cross_validation import train_test_split, Bootstrap | |
| from sklearn.datasets import make_classification | |
| from sklearn.metrics import accuracy_score | |
| from sklearn.tree import DecisionTreeClassifier | |
| def run(sc): |
| # Automatically install pptpd on Amazon EC2 Amazon Linux | |
| # | |
| # Ripped from http://blog.diahosting.com/linux-tutorial/pptpd/ | |
| # pptpd source RPM packaged by its authors | |
| # | |
| # WARNING: | |
| # The first ms-dns setting is 172.16.0.23 because that address was showing in my | |
| # /etc/resolv.conf; I'm not sure it is the same in all Amazon AWS zones. | |
| # | |
| # You also need to adjust the "Security Groups" you are using. | |
| import multiprocessing | |
| import pandas as pd | |
| import numpy as np | |
| def _apply_df(args): | |
| df, func, kwargs = args | |
| return df.apply(func, **kwargs) | |
| def apply_by_multiprocessing(df, func, **kwargs): | |
| workers = kwargs.pop('workers') |
# List the unique values in a DataFrame column
pd.unique(df.column_name.ravel())

# Convert a Series to a numeric dtype; values that cannot be parsed as
# numbers become NaN. Series.convert_objects() no longer exists in pandas,
# so pd.to_numeric(..., errors='coerce') is used instead.
df['col'] = pd.to_numeric(df['col'].astype(str), errors='coerce')

# Keep only the DataFrame rows whose column holds one of the listed values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]