# Examples for python and Spark

[Link](https://github.com/apache/spark/tree/branch-1.2/examples/src/main/python)

- Word Count

```python
"""Count how often each space-separated word occurs in a text file.

Usage: wordcount <file>
"""
from __future__ import print_function

import sys
from operator import add

from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: wordcount <file>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected
        # by the site module and is not guaranteed to exist.
        sys.exit(-1)

    sc = SparkContext(appName="PythonWordCount")
    lines = sc.textFile(sys.argv[1], 1)
    # Split every line into words, pair each word with 1, then sum per word.
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    sc.stop()
```

- Word Count by Streaming

```python
"""
 Counts words in new text files created in the given directory
 Usage: hdfs_wordcount.py <directory>
   <directory> is the directory that Spark Streaming will use to find and read new text files.

 To run this on your local machine on directory `localdir`, run this example
    $ bin/spark-submit examples/src/main/python/streaming/hdfs_wordcount.py localdir

 Then create a text file in `localdir` and the words in the file will get counted.
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: hdfs_wordcount.py <directory>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected
        # by the site module and is not guaranteed to exist.
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingHDFSWordCount")
    # Second argument is the batch duration passed to StreamingContext.
    ssc = StreamingContext(sc, 1)

    lines = ssc.textFileStream(sys.argv[1])
    # Split every line into words, pair each word with 1, then sum per word.
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .map(lambda x: (x, 1))\
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
```

- Sort

```python
"""Sort the space-separated integers of a text file and print them.

Usage: sort <file>
"""
from __future__ import print_function

import sys

from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: sort <file>", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected
        # by the site module and is not guaranteed to exist.
        sys.exit(-1)

    sc = SparkContext(appName="PythonSort")
    lines = sc.textFile(sys.argv[1], 1)
    # sortByKey's first positional parameter is `ascending`, not a key
    # function, so no argument is passed: the defaults already sort the
    # integer keys in ascending order.
    sortedCount = lines.flatMap(lambda x: x.split(' ')) \
                       .map(lambda x: (int(x), 1)) \
                       .sortByKey()
    # This is just a demo on how to bring all the sorted data back to a single node.
    # In reality, we wouldn't want to collect all the data to the driver node.
    output = sortedCount.collect()
    for (num, unitcount) in output:
        print(num)

    sc.stop()
```