Created July 18, 2018 19:18
GCP Leveraging Unstructured Data - Lab 3: Submit Dataproc jobs for unstructured data
Connected, host fingerprint: ssh-rsa 2048 F3:7F:24:6D:E9:7B:B1:16:6C:D8:49:A7:CF:C0:7A:23:25:EB:72:AF

The programs included with the Debian GNU/Linux system are free software;
the exact distribution terms for each program are described in the
individual files in /usr/share/doc/*/copyright.

Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent
permitted by applicable law.
google710312_student@dataproc-cluster-m:~$ cd
google710312_student@dataproc-cluster-m:~$ cp -r /training .
google710312_student@dataproc-cluster-m:~$ ls
training
google710312_student@dataproc-cluster-m:~$ hadoop fs -ls /
18/07/18 18:32:03 INFO gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
Found 3 items
drwxrwxrwt   - mapred hadoop          0 2018-07-18 18:21 /mapred
drwxrwxrwt   - mapred hadoop          0 2018-07-18 18:21 /tmp
drwxrwxrwt   - hdfs   hadoop          0 2018-07-18 18:21 /user
google710312_student@dataproc-cluster-m:~$ cd ~/training
google710312_student@dataproc-cluster-m:~/training$ ls
road-not-taken.txt  sherlock-holmes.txt  training-data-analyst
google710312_student@dataproc-cluster-m:~/training$ hadoop fs -mkdir /sampledata
18/07/18 18:33:04 INFO gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
google710312_student@dataproc-cluster-m:~/training$ hadoop fs -copyFromLocal road-not-taken.txt /sampledata/.
18/07/18 18:33:07 INFO gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
google710312_student@dataproc-cluster-m:~/training$ hadoop fs -copyFromLocal sherlock-holmes.txt /sampledata/.
18/07/18 18:33:19 INFO gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
google710312_student@dataproc-cluster-m:~/training$ holmes.txt /sampledata/.
-bash: holmes.txt: command not found
google710312_student@dataproc-cluster-m:~/training$ hadoop fs -ls /sampledata
18/07/18 18:34:27 INFO gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
Found 2 items
-rw-r--r--   2 google710312_student hadoop     729 2018-07-18 18:33 /sampledata/road-not-taken.txt
-rw-r--r--   2 google710312_student hadoop  574845 2018-07-18 18:33 /sampledata/sherlock-holmes.txt
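The GHFS lines in the output above come from the Cloud Storage connector that Dataproc installs alongside HDFS. That connector means staging into HDFS is optional: textFile() also accepts gs:// URIs directly. A minimal sketch of the alternative, assuming the two text files had instead been uploaded to a bucket (the bucket name here is a placeholder, not part of the lab):

# Inside the pyspark shell, where the SparkContext `sc` already exists.
# gs://your-lab-bucket is hypothetical; use a bucket the cluster's
# service account can read.
lines = sc.textFile("gs://your-lab-bucket/sampledata/sherlock-holmes.txt")
lines.count()  # same RDD API as with the HDFS path used in the session below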
google710312_student@dataproc-cluster-m:~/training$ pyspark
Python 2.7.9 (default, Jun 29 2016, 13:08:31)
[GCC 4.9.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/

Using Python version 2.7.9 (default, Jun 29 2016 13:08:31)
SparkSession available as 'spark'.
>>> lines = sc.textFile("/sampledata/sherlock-holmes.txt")
>>> type(lines)
<class 'pyspark.rdd.RDD'>
>>> lines.count()
12652
>>> lines.take(15)
[u'', u'THE ADVENTURES OF SHERLOCK HOLMES by ARTHUR CONAN DOYLE', u'', u'', u'', u'', u'A Scandal in Bohemia', u'The Red-headed League', u'A Case of Identity', u'The Boscombe Valley Mystery', u'The Five Orange Pips', u'The Man with the Twisted Lip', u'The Adventure of the Blue Carbuncle', u'The Adventure of the Speckled Band', u"The Adventure of the Engineer's Thumb"]
>>> words = lines.flatMap(lambda x: x.split(' '))
>>> type(words)
<class 'pyspark.rdd.PipelinedRDD'>
>>> words.count()
107265
>>> words.take(15)
[u'', u'THE', u'ADVENTURES', u'OF', u'SHERLOCK', u'HOLMES', u'by', u'ARTHUR', u'CONAN', u'DOYLE', u'', u'', u'', u'', u'A']
>>> pairs = words.map(lambda x: (x,len(x)))
>>> type(pairs)
<class 'pyspark.rdd.PipelinedRDD'>
>>> pairs.count()
107265
>>> pairs.take(5)
[(u'', 0), (u'THE', 3), (u'ADVENTURES', 10), (u'OF', 2), (u'SHERLOCK', 8)]
>>> pairs = words.map(lambda x: (len(x),1))
>>> pairs.take(5)
[(0, 1), (3, 1), (10, 1), (2, 1), (8, 1)]
>>> from operator import add
>>> wordsize = pairs.reduceByKey(add)
>>> type(wordsize)
<class 'pyspark.rdd.PipelinedRDD'>
>>> wordsize.count()
22
>>> wordsize.take(5)
[(0, 2756), (2, 18052), (4, 19456), (6, 8622), (8, 4664)]
>>> output = wordsize.collect()
>>> type(output)
<type 'list'>
>>> for (size,count) in output: print(size, count)
...
(0, 2756)
(2, 18052)
(4, 19456)
(6, 8622)
(8, 4664)
(10, 1730)
(12, 585)
(14, 159)
(16, 31)
(18, 8)
(20, 4)
(1, 5141)
(3, 22939)
>>> from pyspark.sql import SparkSession
>>> lines = sc.textFile("/sampledata/sherlock-holmes.txt")
>>>
>>> words = lines.flatMap(lambda x: x.split(' '))
>>> pairs = words.map(lambda x: (len(x),1))
>>> wordsize = pairs.reduceByKey(add)
>>> output = wordsize.sortByKey().collect()
>>> output2 = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (len(x),1)).reduceByKey(add).sortByKey().collect()
>>>
>>> for (size, count) in output2: print(size, count)
...
(0, 2756)
(1, 5141)
(2, 18052)
(3, 22939)
(4, 19456)
(5, 12044)
(6, 8622)
(7, 6615)
(8, 4664)
(9, 2980)
(10, 1730)
(11, 1035)
(12, 585)
(13, 352)
(14, 159)
(15, 75)
(16, 31)
(17, 12)
(18, 8)
(19, 4)
(20, 4)
(21, 1)
>>> exit()
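Outside the shell, the same word-length pipeline needs only one extra piece: spark-submit does not pre-create sc, so a script must build its own SparkSession. A sketch that collects the statements above into a standalone file (the name wordlength.py is hypothetical):

from operator import add
from pyspark.sql import SparkSession

# Build the session and context that the pyspark shell provided for free.
spark = SparkSession.builder.appName("wordlength").getOrCreate()
sc = spark.sparkContext

lines = sc.textFile("/sampledata/sherlock-holmes.txt")

# Same chain as the interactive session: (length, 1) pairs, summed per
# length, then sorted by length.
output = lines.flatMap(lambda x: x.split(' ')) \
              .map(lambda x: (len(x), 1)) \
              .reduceByKey(add) \
              .sortByKey() \
              .collect()

for (size, count) in output:
    print(size, count)

spark.stop()

Run on the cluster with spark-submit wordlength.py, exactly as the lab does for wordcount.py next.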
google710312_student@dataproc-cluster-m:~/training$ vi wordcount.py
google710312_student@dataproc-cluster-m:~/training$ spark-submit wordcount.py
Okay Google.
18/07/18 19:07:06 INFO org.spark_project.jetty.util.log: Logging initialized @2865ms
18/07/18 19:07:06 INFO org.spark_project.jetty.server.Server: jetty-9.3.z-SNAPSHOT
18/07/18 19:07:06 INFO org.spark_project.jetty.server.Server: Started @2958ms
18/07/18 19:07:06 INFO org.spark_project.jetty.server.AbstractConnector: Started ServerConnector@29eeb21d{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
18/07/18 19:07:06 INFO com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase: GHFS version: 1.6.7-hadoop2
18/07/18 19:07:07 INFO org.apache.hadoop.yarn.client.RMProxy: Connecting to ResourceManager at dataproc-cluster-m/10.128.0.2:8032
18/07/18 19:07:10 INFO org.apache.hadoop.yarn.client.api.impl.YarnClientImpl: Submitted application application_1531938045693_0002
18/07/18 19:07:16 WARN org.apache.spark.sql.execution.streaming.FileStreamSink: Error while looking for metadata directory.
ABOUT = 1
AGES = 2
ALL = 1
AND = 9
ANOTHER = 1
AS = 5
BACK. = 1
BE = 2
BECAUSE = 1
BENT = 1
BETTER = 1
BLACK. = 1
BOTH = 2
BY, = 1
CLAIM, = 1
COME = 1
COULD = 2
DAY! = 1
DIFFERENCE. = 1
DIVERGED = 2
DOUBTED = 1
DOWN = 1
EQUALLY = 1
EVER = 1
FAIR, = 1
FAR = 1
FIRST = 1
FOR = 2
GRASSY = 1
HAD = 2
HAS = 1
HAVING = 1
HENCE: = 1
HOW = 1
I- = 1
IF = 1
IN = 4
IT = 2
JUST = 1
KEPT = 1
KNOWING = 1
LAY = 1
LEADS = 1
LEAVES = 1
LESS = 1
LONG = 1
LOOKED = 1
MADE = 1
MORNING = 1
NO = 1
NOT = 1
OH, = 1
ON = 1
ONE = 3
OTHER, = 1
PASSING = 1
PERHAPS = 1
REALLY = 1
ROADS = 2
SAME, = 1
SHALL = 1
SHOULD = 1
SIGH = 1
SOMEWHERE = 1
SORRY = 1
STEP = 1
STOOD = 1
TELLING = 1
THAT = 3
THE = 8
THEM = 1
THEN = 1
THERE = 1
THIS = 1
THOUGH = 1
TO = 2
TOOK = 2
TRAVEL = 1
TRAVELED = 1
TRAVELER, = 1
TRODDEN = 1
TWO = 1
UNDERGROWTH; = 1
WANTED = 1
WAS = 1
WAY = 1
WAY, = 1
WEAR; = 1
WHERE = 1
WITH = 1
WOOD, = 2
WORN = 1
YELLOW = 1
YET = 1
18/07/18 19:07:24 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark@29eeb21d{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
google710312_student@dataproc-cluster-m:~/training$
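The transcript never shows wordcount.py itself, but its output constrains it: the job prints "Okay Google.", then every uppercased word of road-not-taken.txt with its count, alphabetically ordered and formatted as WORD = n. A minimal sketch consistent with that behavior (the actual lab script may differ in details):

from operator import add
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("wordcount").getOrCreate()
sc = spark.sparkContext

print("Okay Google.")

lines = sc.textFile("/sampledata/road-not-taken.txt")

# Uppercase each line, split on spaces, drop empty tokens, then count
# occurrences of each word and sort alphabetically.
counts = lines.flatMap(lambda line: line.upper().split(' ')) \
              .filter(lambda word: word != '') \
              .map(lambda word: (word, 1)) \
              .reduceByKey(add) \
              .sortByKey() \
              .collect()

for (word, count) in counts:
    print('%s = %d' % (word, count))

spark.stop()

Because the job runs under spark-submit rather than the shell, the script creates its own SparkSession. A script like this could also be submitted without SSH-ing into the master at all, e.g. gcloud dataproc jobs submit pyspark wordcount.py --cluster=dataproc-cluster (cluster name inferred from the dataproc-cluster-m master hostname in the prompt above).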