@kenhui521
Forked from dapangmao/spark_py_examples.md
Last active August 29, 2015 14:23
Revisions

  1. @dapangmao revised this gist Nov 21, 2014. 1 changed file with 0 additions and 5 deletions (spark_py_examples.md). The revision only removes stray blank lines around the imports; the affected hunks:

     ````diff
     @@ -4,7 +4,6 @@
     import sys
     from operator import add

     from pyspark import SparkContext

     if __name__ == "__main__":
     @@ -34,9 +33,7 @@ if __name__ == "__main__":
     Then create a text file in `localdir` and the words in the file will get counted.
     """

     import sys

     from pyspark import SparkContext
     from pyspark.streaming import StreamingContext

     @@ -60,10 +57,8 @@ if __name__ == "__main__":
     - Sort
     ```python
     import sys

     from pyspark import SparkContext


     if __name__ == "__main__":
         if len(sys.argv) != 2:
             print >> sys.stderr, "Usage: sort <file>"
     ````
  2. @dapangmao revised this gist Nov 21, 2014. 1 changed file with 24 additions and 0 deletions (spark_py_examples.md), appending a Sort example after the streaming word count:

     - Sort
     ```python
     import sys

     from pyspark import SparkContext


     if __name__ == "__main__":
         if len(sys.argv) != 2:
             print >> sys.stderr, "Usage: sort <file>"
             exit(-1)
         sc = SparkContext(appName="PythonSort")
         lines = sc.textFile(sys.argv[1], 1)
         # sortByKey() sorts the (int, 1) pairs by their integer key in
         # ascending order; no key function is needed.
         sortedCount = lines.flatMap(lambda x: x.split(' ')) \
             .map(lambda x: (int(x), 1)) \
             .sortByKey()
         # This is just a demo on how to bring all the sorted data back to a single node.
         # In reality, we wouldn't want to collect all the data to the driver node.
         output = sortedCount.collect()
         for (num, unitcount) in output:
             print num

         sc.stop()
     ```
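     For a quick sanity check without an input file, the same pipeline can run on an in-memory RDD from a `pyspark` shell (a minimal sketch; `sc` is the shell's SparkContext and the sample data is made up):
     ```python
     # Same transformations as the example, on a tiny in-memory RDD.
     nums = sc.parallelize(["3 1", "2"])
     sortedCount = nums.flatMap(lambda x: x.split(' ')) \
         .map(lambda x: (int(x), 1)) \
         .sortByKey()
     print sortedCount.collect()  # [(1, 1), (2, 1), (3, 1)]
     ```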
  3. @dapangmao revised this gist Nov 20, 2014. 1 changed file with 36 additions and 1 deletion (spark_py_examples.md), adding a streaming word count after the batch example:

     - Word Count by Streaming
     ```python
     """
     Counts words in new text files created in the given directory
     Usage: hdfs_wordcount.py <directory>
       <directory> is the directory that Spark Streaming will use to find and read new text files.
     To run this on your local machine on directory `localdir`, run this example
       $ bin/spark-submit examples/src/main/python/streaming/hdfs_wordcount.py localdir
     Then create a text file in `localdir` and the words in the file will get counted.
     """

     import sys

     from pyspark import SparkContext
     from pyspark.streaming import StreamingContext

     if __name__ == "__main__":
         if len(sys.argv) != 2:
             print >> sys.stderr, "Usage: hdfs_wordcount.py <directory>"
             exit(-1)

         sc = SparkContext(appName="PythonStreamingHDFSWordCount")
         ssc = StreamingContext(sc, 1)

         lines = ssc.textFileStream(sys.argv[1])
         counts = lines.flatMap(lambda line: line.split(" "))\
             .map(lambda x: (x, 1))\
             .reduceByKey(lambda a, b: a+b)
         counts.pprint()

         ssc.start()
         ssc.awaitTermination()
     ```
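     Here `StreamingContext(sc, 1)` sets a 1-second batch interval: each second, `textFileStream` turns any files newly created in the watched directory into an RDD, the flatMap/map/reduceByKey pipeline runs on that batch, and `pprint()` prints the first few (word, count) pairs.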
  4. @dapangmao created this gist Nov 20, 2014. 1 changed file with 24 additions and 0 deletions (spark_py_examples.md):

     # Examples for python and Spark
     [Link](https://github.com/apache/spark/tree/branch-1.2/examples/src/main/python)
     - Word Count
     ```python
     import sys
     from operator import add

     from pyspark import SparkContext

     if __name__ == "__main__":
         if len(sys.argv) != 2:
             print >> sys.stderr, "Usage: wordcount <file>"
             exit(-1)
         sc = SparkContext(appName="PythonWordCount")
         lines = sc.textFile(sys.argv[1], 1)
         counts = lines.flatMap(lambda x: x.split(' ')) \
             .map(lambda x: (x, 1)) \
             .reduceByKey(add)
         output = counts.collect()
         for (word, count) in output:
             print "%s: %i" % (word, count)

         sc.stop()
     ```
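     The gist gives no filename for this script; assuming it is saved as, say, `wordcount.py`, it can be run with `$ bin/spark-submit wordcount.py <file>` and prints each distinct word with its count.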