-
-
Save vishwassharma/45a2f73ef4570fb4ed39 to your computer and use it in GitHub Desktop.
Revisions
-
mattes revised this gist
Mar 20, 2014. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -26,7 +26,7 @@ def post(self): cursor = None while True: pages, next_cursor, more = pages_query.fetch_page(50, start_cursor=cursor) for page in pages: string = "" string += page.fetched.strftime("%Y-%m-%d %H:%M:%S") -
mattes revised this gist
Mar 20, 2014. 1 changed file with 0 additions and 4 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -37,10 +37,6 @@ def post(self): string += "\n" gcs_file.write(string) if(more): cursor = next_cursor else: -
mattes revised this gist
Mar 20, 2014. 1 changed file with 6 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,6 +12,9 @@ class CreateCSVHandler(webapp2.RequestHandler): def post(self): fetched = self.request.get('fetched') fetched = datetime.strptime(fetched, "%Y-%m-%d") ctx = ndb.get_context() ctx.set_cache_policy(lambda key: key.kind() != 'Page') pages_query = Page.query() pages_query = pages_query.filter(Page.fetched >= fetched) @@ -24,9 +27,6 @@ def post(self): cursor = None while True: pages, next_cursor, more = pages_query.fetch_page(10, start_cursor=cursor) for page in pages: string = "" string += page.fetched.strftime("%Y-%m-%d %H:%M:%S") @@ -47,3 +47,6 @@ def post(self): break gcs_file.close() gc.collect() ctx.clear_cache() -
mattes revised this gist
Mar 19, 2014. 1 changed file with 7 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -24,6 +24,9 @@ def post(self): cursor = None while True: pages, next_cursor, more = pages_query.fetch_page(10, start_cursor=cursor) gc.collect() for page in pages: string = "" string += page.fetched.strftime("%Y-%m-%d %H:%M:%S") @@ -34,6 +37,10 @@ def post(self): string += "\n" gcs_file.write(string) gcs_file.close() logging.warning("stop after first batch!") return if(more): cursor = next_cursor else: -
mattes created this gist
Mar 19, 2014. There are no files selected for viewing.
# Date format accepted for the ``fetched`` request parameter.
_FETCHED_DATE_FORMAT = "%Y-%m-%d"


def _parse_fetched(value):
    """Parse *value* as a ``YYYY-MM-DD`` date.

    Returns the parsed ``datetime``, or ``None`` when *value* is missing or
    does not match the expected format.
    """
    try:
        return datetime.strptime(value, _FETCHED_DATE_FORMAT)
    except (TypeError, ValueError):
        return None


def _page_to_row(page):
    """Serialize one ``Page`` entity as a ``;``-separated line.

    The line holds the fetch timestamp, the URL-quoted page URL and the
    base64-encoded, zlib-compressed (level 9) UTF-8 HTML, followed by a
    newline.
    """
    fields = [
        page.fetched.strftime("%Y-%m-%d %H:%M:%S"),
        urllib.quote(page.url),
        base64.b64encode(zlib.compress(page.html.encode('utf-8'), 9)),
    ]
    return ";".join(fields) + "\n"


class InitCSVHandler(webapp2.RequestHandler):
    """Enqueues a CSV export task for the day given in ``fetched``."""

    def get(self):
        """Validate the ``fetched`` parameter and add a task to the ``csv`` queue."""
        fetched = self.request.get('fetched')
        if not fetched:
            self.response.write("no fetched date")
            return

        # Reject malformed dates here: the worker cannot process them, and a
        # failing task would otherwise be retried by the queue.
        if _parse_fetched(fetched) is None:
            self.response.status_int = 400
            self.response.write("invalid fetched date, expected YYYY-MM-DD")
            return

        taskqueue.add(url='/create_csv', queue_name='csv',
                      params={'fetched': fetched})
        self.response.write("creating " + str(fetched))


class CreateCSVHandler(webapp2.RequestHandler):
    """Task handler that writes one day's ``Page`` entities to a file via ``gcs``."""

    def post(self):
        """Export every ``Page`` fetched on the requested day.

        Reads the ``fetched`` request parameter (``YYYY-MM-DD``), queries the
        pages whose ``fetched`` timestamp falls within that day, and writes
        one line per page to ``/csv123/<fetched>.csv``.
        """
        raw_fetched = self.request.get('fetched')
        fetched = _parse_fetched(raw_fetched)
        if fetched is None:
            # Return normally instead of raising: an error response would make
            # the task queue retry a task that can never succeed.
            logging.error("invalid fetched date: %r", raw_fetched)
            return

        # Half-open interval [fetched, fetched + 1 day).
        pages_query = Page.query()
        pages_query = pages_query.filter(Page.fetched >= fetched)
        pages_query = pages_query.filter(Page.fetched < fetched + timedelta(days=1))

        filename = "/csv123/" + raw_fetched + ".csv"
        logging.info("filename: " + filename)

        # NOTE(review): close() is only reached on success; presumably an
        # abandoned upload is preferable to finalizing a partial file -- confirm.
        gcs_file = gcs.open(filename, 'w', content_type="text/plain")

        # Page through the results in small batches using query cursors.
        cursor = None
        more = True
        while more:
            pages, cursor, more = pages_query.fetch_page(10, start_cursor=cursor)
            for page in pages:
                gcs_file.write(_page_to_row(page))

        gcs_file.close()