I hereby claim:
- I am mattfaus on github.
- I am mattfaus (https://keybase.io/mattfaus) on keybase.
- I have a public key whose fingerprint is 1CF5 6643 9369 2689 9402 2358 69E8 0354 58E5 E154
To claim this, I am signing this object:
import db_util

db_util.enable_db_protobuf_projection()
db_util.enable_ndb_protobuf_projection()

class BatchedGcsCsvShardFileWriter(object):
    """Writes CSV data into multiple output shards, grouping rows by keys.

    This class is a context manager, which closes all shards upon exit.

    Say you are writing a lot of CSV data, like:

        [0, "Bakery"],
        [2, "Francisco"],
        [3, "Matt"],
class SortedGcsCsvShardFileMergeReader(object):
    """Merges several sorted .csv files stored on GCS.

    This class is both an iterator and a context manager.

    Let's say there are 2 .csv files stored on GCS, with contents like:

    /bucket/file_1.csv:
        [0, "Matt"],
        [0, "Sam"],
class ParallelInMemorySortGcsCsvShardFiles(pipeline.Pipeline):

    def run(self, input_bucket, input_pattern, sort_columns,
            model_type, output_bucket, output_pattern):
        """Sorts each input file in-memory, then writes it to an output file.

        Arguments:
            input_bucket - The GCS bucket which contains the unsorted .csv
                files.
            input_pattern - A regular expression used to find files in the
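The per-file step of this pipeline amounts to: read one shard fully into memory, sort it, and write it back out. A hedged sketch of that step on a local file, assuming sort_columns is a list of column indexes; the real pipeline reads from and writes to GCS and may take column names instead.

import csv
import operator

def sort_shard_in_memory(input_path, output_path, sort_columns):
    """Read a whole shard into memory, sort it on the given columns, and
    write the sorted rows to the output path."""
    with open(input_path) as infile:
        rows = list(csv.reader(infile))
    rows.sort(key=operator.itemgetter(*sort_columns))
    with open(output_path, 'w') as outfile:
        csv.writer(outfile).writerows(rows)
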
class DeterministicCompressedFeatures(CompressedFeatures):
    """Generates random components after seeding with the component_key.

    By using a known seed to generate the random components, we do not need to
    store or manage them. We can just recompute them whenever we need them.
    """

    def __init__(self, num_features=RANDOM_FEATURE_LENGTH):
        super(DeterministicCompressedFeatures, self).__init__(num_features)
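The docstring's point is that a component vector never needs to be stored: it can be recomputed on demand by reseeding a PRNG from the component_key. A minimal sketch of that seeding trick follows; the function name and the use of numpy and MD5 are assumptions, not necessarily what the gist actually does.

import hashlib

import numpy as np

def get_component(component_key, num_features):
    """Derive a stable 32-bit seed from the key, then regenerate the same
    pseudo-random component vector on every call."""
    seed = int(hashlib.md5(component_key.encode('utf-8')).hexdigest(), 16) % (2 ** 32)
    return np.random.RandomState(seed).standard_normal(num_features)
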
{
    u'fields': [{
        u'type': u'STRING',
        u'name': u'playlists',
        u'mode': u'REPEATED'
    }, {
        u'type': u'STRING',
        u'name': u'source_table',
        u'mode': u'NULLABLE'
    }, {

def get_table_schema(dataset, table):
    """If the table exists, returns its schema. Otherwise, returns None."""
    table_service = BigQueryService.get_service().tables()
    try:
        get_result = table_service.get(
            projectId=BQ_PROJECT_ID,
            datasetId=dataset,
            tableId=table
        ).execute()
        return get_result['schema']
    except HttpError:
        # Assumed completion (the preview is truncated here): apiclient's
        # tables().get() raises HttpError (e.g. 404) for a missing table,
        # which we treat as "no schema".
        return None
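A short usage sketch of the helper above, with placeholder dataset and table names; the return value is a schema dict shaped like the u'fields' fragment shown earlier, so column names can be read straight out of it.

# 'analytics_backup' and 'VideoTranslationInfo' are placeholder names.
schema = get_table_schema('analytics_backup', 'VideoTranslationInfo')
if schema is None:
    pass  # The table does not exist yet; create it before loading data.
else:
    existing_columns = [field['name'] for field in schema['fields']]
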
import collections
import jinja2
import logging
import os

import request_handler
import third_party.mapreduce
import third_party.mapreduce.input_readers
import third_party.mapreduce.output_writers
import third_party.mapreduce.lib.files
import third_party.mapreduce.operation

class TransformedVideoTranslationInfo(bq_property_transform.TransformedEntity):

    CUSTOM_SCHEMAS = {
        'translated_youtube_ids': {
            'name': 'translated_youtube_ids',
            'type': 'record',
            'mode': 'repeated',
            'fields': [
                {'name': 'language',
                 'type': 'string'},
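For context, a repeated record field of this shape holds a list of sub-objects in each row. Below is a hypothetical row matching the schema above; the second sub-field name is a guess, since the preview cuts off after 'language'.

row = {
    'translated_youtube_ids': [
        {'language': 'es', 'translated_youtube_id': 'abc123'},
        {'language': 'pt', 'translated_youtube_id': 'def456'},
    ],
}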