-
-
Save rdegraci/c18c3f591cac1c7cbcc8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- | |
| -- Rodney Degracia | |
| -- | |
| -- July 2014 | |
| -- | |
| -- Script to find the frequency of books published each year | |
| -- | |
| -- http://www2.informatik.uni-freiburg.de/~cziegler/BX/ | |
| -- | |
| -- | |
| -- http://www.orzota.com/pig-for-beginners/ | |
| -- Clean up the data first | |
| -- sed -e 's/\&amp;/\&/g' BX-Books.csv | sed -e '1d' | sed -e 's/;/$$$/g' | sed -e 's/"$$$"/";"/g' > BX-BooksCorrected.txt | |
| -- | |
| -- Hadoop-2.4.0 | |
| -- Pig-0.13.0 | |
| -- Debian 7.5 | |
| -- | |
| -- Load the cleaned catalogue; fields are separated by ';' and every column is read as text. | |
| BookXRecords = LOAD '/hduser/BX-BooksCorrected.txt' | |
| USING PigStorage(';') AS (ISBN:chararray,BookTitle:chararray, | |
| BookAuthor:chararray,YearOfPublication:chararray, | |
| Publisher:chararray,ImageURLS:chararray,ImageURLM:chararray,ImageURLL:chararray); | |
| -- One group per distinct YearOfPublication value. | |
| GroupByYear = GROUP BookXRecords BY YearOfPublication; | |
| -- Emit a single chararray per group in the form "<year>:<number of records>". | |
| -- $0 is the group key (the year), $1 is the bag of records for that year. | |
| CountByYear = FOREACH GroupByYear | |
| GENERATE CONCAT((chararray)$0,CONCAT(':',(chararray)COUNT($1))); | |
| -- Write the result using a tab as field delimiter. The delimiter must be written '\t'; | |
| -- a bare 't' would make the letter t the separator. | |
| STORE CountByYear | |
| INTO '/hduser/pig_output_bookx' USING PigStorage('\t'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment