Skip to content

Instantly share code, notes, and snippets.

@vishwassharma
Forked from mattes/gae_csv.py
Last active August 29, 2015 14:06
Show Gist options
  • Save vishwassharma/45a2f73ef4570fb4ed39 to your computer and use it in GitHub Desktop.
Save vishwassharma/45a2f73ef4570fb4ed39 to your computer and use it in GitHub Desktop.

Revisions

  1. @mattes mattes revised this gist Mar 20, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion gae_csv.py
    Original file line number Diff line number Diff line change
    @@ -26,7 +26,7 @@ def post(self):

    cursor = None
    while True:
    pages, next_cursor, more = pages_query.fetch_page(10, start_cursor=cursor)
    pages, next_cursor, more = pages_query.fetch_page(50, start_cursor=cursor)
    for page in pages:
    string = ""
    string += page.fetched.strftime("%Y-%m-%d %H:%M:%S")
  2. @mattes mattes revised this gist Mar 20, 2014. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions gae_csv.py
    Original file line number Diff line number Diff line change
    @@ -37,10 +37,6 @@ def post(self):
    string += "\n"
    gcs_file.write(string)

    gcs_file.close()
    logging.warning("stop after first batch!")
    return

    if(more):
    cursor = next_cursor
    else:
  3. @mattes mattes revised this gist Mar 20, 2014. 1 changed file with 6 additions and 3 deletions.
    9 changes: 6 additions & 3 deletions gae_csv.py
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,9 @@ class CreateCSVHandler(webapp2.RequestHandler):
    def post(self):
    fetched = self.request.get('fetched')
    fetched = datetime.strptime(fetched, "%Y-%m-%d")

    ctx = ndb.get_context()
    ctx.set_cache_policy(lambda key: key.kind() != 'Page')

    pages_query = Page.query()
    pages_query = pages_query.filter(Page.fetched >= fetched)
    @@ -24,9 +27,6 @@ def post(self):
    cursor = None
    while True:
    pages, next_cursor, more = pages_query.fetch_page(10, start_cursor=cursor)

    gc.collect()

    for page in pages:
    string = ""
    string += page.fetched.strftime("%Y-%m-%d %H:%M:%S")
    @@ -47,3 +47,6 @@ def post(self):
    break

    gcs_file.close()

    gc.collect()
    ctx.clear_cache()
  4. @mattes mattes revised this gist Mar 19, 2014. 1 changed file with 7 additions and 0 deletions.
    7 changes: 7 additions & 0 deletions gae_csv.py
    Original file line number Diff line number Diff line change
    @@ -24,6 +24,9 @@ def post(self):
    cursor = None
    while True:
    pages, next_cursor, more = pages_query.fetch_page(10, start_cursor=cursor)

    gc.collect()

    for page in pages:
    string = ""
    string += page.fetched.strftime("%Y-%m-%d %H:%M:%S")
    @@ -34,6 +37,10 @@ def post(self):
    string += "\n"
    gcs_file.write(string)

    gcs_file.close()
    logging.warning("stop after first batch!")
    return

    if(more):
    cursor = next_cursor
    else:
  5. @mattes mattes created this gist Mar 19, 2014.
    42 changes: 42 additions & 0 deletions gae_csv.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    class InitCSVHandler(webapp2.RequestHandler):
        """HTTP entry point that schedules a CSV export for a single day."""

        def get(self):
            """Validate the ``fetched`` query parameter and enqueue the export.

            Reads ``fetched`` from the request. When it is missing, writes
            "no fetched date" and stops. When it is not a ``YYYY-MM-DD`` date,
            answers 400 and stops. Otherwise adds a task for ``/create_csv``
            on the ``csv`` queue and writes a confirmation.
            """
            # Function-scope import so this handler does not depend on how the
            # rest of the module imports ``datetime``.
            from datetime import datetime

            fetched = self.request.get('fetched')
            if not fetched:
                self.response.write("no fetched date")
                return

            # Reject malformed dates here: the worker parses the value with the
            # same format string, so a bad value would only fail once the task
            # is already queued.
            try:
                datetime.strptime(fetched, "%Y-%m-%d")
            except ValueError:
                self.response.set_status(400)
                self.response.write("invalid fetched date, expected YYYY-MM-DD")
                return

            taskqueue.add(
                url='/create_csv',
                queue_name='csv',
                params={'fetched': fetched},
            )
            self.response.write("creating " + str(fetched))

    class CreateCSVHandler(webapp2.RequestHandler):
        """Worker that writes one day of ``Page`` entities to a file via ``gcs``."""

        @staticmethod
        def _format_row(page):
            """Return one ``;``-separated, newline-terminated line for ``page``.

            Columns: fetch timestamp, percent-quoted URL, and the base64 of the
            zlib-compressed (level 9) UTF-8 HTML.
            """
            # Encode before quoting: Python 2's urllib.quote raises KeyError on
            # unicode input containing non-ASCII characters.
            # NOTE(review): assumes page.url is a unicode string, the same way
            # page.html is treated below - confirm against the Page model.
            quoted_url = urllib.quote(page.url.encode('utf-8'))
            packed_html = base64.b64encode(
                zlib.compress(page.html.encode('utf-8'), 9))
            timestamp = page.fetched.strftime("%Y-%m-%d %H:%M:%S")
            return ";".join([timestamp, quoted_url, packed_html]) + "\n"

        def post(self):
            """Export every ``Page`` fetched on the requested day.

            Reads ``fetched`` (``YYYY-MM-DD``) from the request, queries pages
            whose ``fetched`` timestamp lies in ``[day, day + 1 day)``, and
            writes one line per page to ``/csv123/<fetched>.csv``. An
            unparseable date is logged and the handler returns without
            raising.
            """
            fetched_param = self.request.get('fetched')
            try:
                day_start = datetime.strptime(fetched_param, "%Y-%m-%d")
            except ValueError:
                # Returning normally instead of raising: a failed task is
                # retried by the queue, and retrying cannot repair a bad date.
                logging.error("invalid fetched date: %r", fetched_param)
                return
            day_end = day_start + timedelta(days=1)

            pages_query = Page.query()
            pages_query = pages_query.filter(Page.fetched >= day_start)
            pages_query = pages_query.filter(Page.fetched < day_end)

            filename = "/csv123/" + fetched_param + ".csv"
            logging.info("filename: " + filename)
            gcs_file = gcs.open(filename, 'w', content_type="text/plain")

            # Page through the results with a cursor rather than loading the
            # whole day in a single fetch.
            cursor = None
            while True:
                pages, next_cursor, more = pages_query.fetch_page(
                    10, start_cursor=cursor)
                for page in pages:
                    gcs_file.write(self._format_row(page))

                # A missing cursor would restart the query from the first page
                # and loop forever, so treat it as the end of the results.
                if not more or next_cursor is None:
                    break
                cursor = next_cursor

            # NOTE(review): the file is closed only on success, as in the
            # original; confirm whether closing after a failure would publish
            # a truncated object before wrapping this in try/finally.
            gcs_file.close()