andy51002000 · February 15, 2021 03:35 · Feb 15, 2021
diff --git a/dataflow example.py b/dataflow example.py
@@ -0,0 +1,22 @@
+class WordcountOptions(PipelineOptions):
+    """Pipeline options for the word-count example: --input and --output."""
+
+    @classmethod
+    def _add_argparse_args(cls, parser):
+        # Register the word-count flags on the parser supplied by the caller.
+        input_help = 'Path of the file to read from'
+        input_default = 'gs://dataflow-samples/shakespeare/kinglear.txt'
+        parser.add_argument('--input', default=input_default, help=input_help)
+        output_help = 'Output file to write results to.'
+        parser.add_argument('--output', required=True, help=output_help)
+
+# Build options from a fixed argument list, then a pipeline on InteractiveRunner.
+pipeline_options = PipelineOptions(['--output', './result.txt'])
+p = beam.Pipeline(runner=InteractiveRunner(), options=pipeline_options)
+wordcount_options = pipeline_options.view_as(WordcountOptions)
+# Read lines -> regex word tokens -> lower-case -> count occurrences per element.
+count = (p | 'ReadCollection' >> beam.io.ReadFromText(wordcount_options.input)
+           | 'findWord' >> beam.FlatMap(
+               lambda text: re.findall(r'[\w\']+', text.strip(), re.UNICODE))
+           | "lower" >> beam.Map(lambda token: token.lower())
+           | "lower_count" >> beam.combiners.Count.PerElement())
No results found