Last active
February 27, 2024 10:15
-
-
Save clemsos/8668698 to your computer and use it in GitHub Desktop.
Revisions
-
clemsos revised this gist
Jan 28, 2014 — 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,8 +8,8 @@ t0=time() # size of the bulk chunksize=5000 # open csv file f = open(raw_data_path+csv_filename) # read csv -
clemsos revised this gist
Jan 28, 2014 — 1 changed file with 2 additions and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,3 @@ from pyelasticsearch import ElasticSearch import pandas as pd from time import time @@ -21,13 +20,15 @@ # init ElasticSearch es = ElasticSearch('http://localhost:9200/') # init index try : es.delete_index("weiboscope") except : pass es.create_index("weiboscope") # start bulk indexing print "now indexing %s..."%(csv_filename) for i,df in enumerate(csvfile): -
clemsos created this gist
Jan 28, 2014. There are no files selected for viewing.
"""Bulk-index a CSV file of tweets into a local Elasticsearch instance.

Reads the CSV lazily in fixed-size chunks with pandas, converts each
chunk to a list of record dicts, and pushes it to the "weiboscope"
index via pyelasticsearch's bulk_index API.
"""
import csv  # kept from the original file; not referenced below
from pyelasticsearch import ElasticSearch
import pandas as pd
from time import time

root_path = "/home/clemsos/Dev/mitras/"
raw_data_path = root_path + "data/"
csv_filename = "week10.csv"

t0 = time()

# Size of each pandas chunk == size of each Elasticsearch bulk request.
chunksize = 1000

# init ElasticSearch client (local default port)
es = ElasticSearch('http://localhost:9200/')

# Recreate the index from scratch: drop it if it already exists.
# `except Exception` (not a bare except) so Ctrl-C still interrupts.
try:
    es.delete_index("weiboscope")
except Exception:
    pass
es.create_index("weiboscope")

print("now indexing %s..." % csv_filename)

# `with` guarantees the CSV handle is closed even if indexing fails
# (the original opened the file and never closed it).
with open(raw_data_path + csv_filename) as f:
    # iterator=True + chunksize keeps memory bounded: the whole file
    # is never loaded at once.
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    for i, df in enumerate(csvfile):
        print(i)
        # Replace NaN with None so Elasticsearch receives JSON nulls
        # instead of invalid float NaN values.
        records = df.where(pd.notnull(df), None).T.to_dict()
        list_records = [records[it] for it in records]
        try:
            es.bulk_index("weiboscope", "tweet", list_records)
        except Exception as e:
            # Best-effort behavior preserved (skip the failed chunk),
            # but report which chunk failed and why instead of
            # swallowing the error silently.
            print("error! skipping chunk %d: %s" % (i, e))

print("done in %.3fs" % (time() - t0))