@clemsos
Last active February 27, 2024 10:15

Revisions

  1. clemsos revised this gist Jan 28, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions csv_to_elastic_search_bulk_insert.py
    @@ -8,8 +8,8 @@
     t0=time()

    -# elastic search
    -chunksize=1000
    +# size of the bulk
    +chunksize=5000

     # open csv file
     f = open(raw_data_path+csv_filename) # read csv
  2. clemsos revised this gist Jan 28, 2014. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion csv_to_elastic_search_bulk_insert.py
    @@ -1,4 +1,3 @@
    -import csv
     from pyelasticsearch import ElasticSearch
     import pandas as pd
     from time import time
    @@ -21,13 +20,15 @@
     # init ElasticSearch
     es = ElasticSearch('http://localhost:9200/')

    +# init index
     try :
         es.delete_index("weiboscope")
     except :
         pass

     es.create_index("weiboscope")

    +# start bulk indexing
     print "now indexing %s..."%(csv_filename)

     for i,df in enumerate(csvfile):
  3. clemsos created this gist Jan 28, 2014.
    43 changes: 43 additions & 0 deletions csv_to_elastic_search_bulk_insert.py
    @@ -0,0 +1,43 @@
    +import csv
    +from pyelasticsearch import ElasticSearch
    +import pandas as pd
    +from time import time
    +
    +root_path="/home/clemsos/Dev/mitras/"
    +raw_data_path=root_path+"data/"
    +csv_filename="week10.csv"
    +
    +t0=time()
    +
    +# elastic search
    +chunksize=1000
    +
    +# open csv file
    +f = open(raw_data_path+csv_filename) # read csv
    +
    +# parse csv with pandas
    +csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)
    +
    +# init ElasticSearch
    +es = ElasticSearch('http://localhost:9200/')
    +
    +try :
    +    es.delete_index("weiboscope")
    +except :
    +    pass
    +
    +es.create_index("weiboscope")
    +
    +print "now indexing %s..."%(csv_filename)
    +
    +for i,df in enumerate(csvfile):
    +    print i
    +    records=df.where(pd.notnull(df), None).T.to_dict()
    +    list_records=[records[it] for it in records]
    +    try :
    +        es.bulk_index("weiboscope","tweet",list_records)
    +    except :
    +        print "error!, skiping some tweets sorry"
    +        pass
    +
    +print "done in %.3fs"%(time()-t0)