phucnguyenvn · January 14, 2020 07:32 · Jan 14, 2020
diff --git a/pool.py b/pool.py
@@ -0,0 +1,30 @@
+    # Read CSV file and convert to parquet, upload to S3
+    def csv_manipulation(self, merchant: str, directory: str):
+        temp = []
+        schema = pyarrow.schema(self.get_schema())
+        p = Pool(processes=self.limit_process)
+
+        for f in os.listdir(directory):
+            filepath = os.path.join(directory, f)
+            try: 
+                with open(filepath, 'r', encoding='ISO-8859-1') as csvfile: 
+                    reader = csv.reader(csvfile)
+                    for row in reader:
+                        if len(temp) < self.chunk_size:
+                            temp.append(self.calculate_stat(row))
+                        else:
+                            p.apply_async(self.upload, args=(temp, merchant, schema))
+                            temp = []
+                            temp.append(self.calculate_stat(row))
+            except csv.Error: 
+                # skip error when CSV is empty
+                pass
+
+
+        if len(temp) > 0:
+            p.apply_async(self.upload, args=(temp, merchant, schema))
+            temp = []
+
+        p.close()
+        p.join()
+        return
No results found