@clemsos
Last active February 27, 2024 10:15

Revisions

  1. clemsos revised this gist Jan 28, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions csv_to_elastic_search_bulk_insert.py
    @@ -8,8 +8,8 @@
     t0=time()

    -# elastic search
    -chunksize=1000
    +# size of the bulk
    +chunksize=5000

     # open csv file
     f = open(raw_data_path+csv_filename) # read csv
  2. clemsos revised this gist Jan 28, 2014. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion csv_to_elastic_search_bulk_insert.py
    @@ -1,4 +1,3 @@
    -import csv
     from pyelasticsearch import ElasticSearch
     import pandas as pd
     from time import time
    @@ -21,13 +20,15 @@
     # init ElasticSearch
     es = ElasticSearch('http://localhost:9200/')

    +# init index
     try :
         es.delete_index("weiboscope")
     except :
         pass

     es.create_index("weiboscope")

    +# start bulk indexing
     print "now indexing %s..."%(csv_filename)

     for i,df in enumerate(csvfile):
  3. clemsos created this gist Jan 28, 2014.
    43 changes: 43 additions & 0 deletions csv_to_elastic_search_bulk_insert.py
    @@ -0,0 +1,43 @@
    +import csv
    +from pyelasticsearch import ElasticSearch
    +import pandas as pd
    +from time import time
    +
    +root_path="/home/clemsos/Dev/mitras/"
    +raw_data_path=root_path+"data/"
    +csv_filename="week10.csv"
    +
    +t0=time()
    +
    +# elastic search
    +chunksize=1000
    +
    +# open csv file
    +f = open(raw_data_path+csv_filename) # read csv
    +
    +# parse csv with pandas
    +csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)
    +
    +# init ElasticSearch
    +es = ElasticSearch('http://localhost:9200/')
    +
    +try :
    +    es.delete_index("weiboscope")
    +except :
    +    pass
    +
    +es.create_index("weiboscope")
    +
    +print "now indexing %s..."%(csv_filename)
    +
    +for i,df in enumerate(csvfile):
    +    print i
    +    records=df.where(pd.notnull(df), None).T.to_dict()
    +    list_records=[records[it] for it in records]
    +    try :
    +        es.bulk_index("weiboscope","tweet",list_records)
    +    except :
    +        print "error!, skiping some tweets sorry"
    +        pass
    +
    +print "done in %.3fs"%(time()-t0)