snoopysecurity · September 19, 2022 13:00 · Jun 27, 2016 · May 18, 2016 · Apr 8, 2016 · Apr 8, 2016
diff --git a/big_query_examples.md b/big_query_examples.md
@@ -6,10 +6,10 @@
 
 SELECT count(*)
 FROM (SELECT id, repo_name, path
-        FROM [bigquery-public-data:github_repos.files]
+        FROM [bigquery-public-data:github_repos.sample_files]
       ) AS F
 JOIN (SELECT id
-        FROM [bigquery-public-data:github_repos.contents]
+        FROM [bigquery-public-data:github_repos.sample_contents]
        WHERE NOT binary
          AND content CONTAINS 'This should never happen') AS C
 ON F.id = C.id;
@@ -25,10 +25,10 @@ FROM FLATTEN(
   (SELECT
   SPLIT(SPLIT(REGEXP_EXTRACT(content, r'.*import\s*[(]([^)]*)[)]'), '\n'), ';') AS line,
   FROM
-    (SELECT id, content FROM [bigquery-public-data:github_repos.contents]
+    (SELECT id, content FROM [bigquery-public-data:github_repos.sample_contents]
      WHERE REGEXP_MATCH(content, r'.*import\s*[(][^)]*[)]')) AS C
     JOIN EACH
-    (SELECT id FROM [bigquery-public-data:github_repos.files]
+    (SELECT id FROM [bigquery-public-data:github_repos.sample_files]
      WHERE path LIKE '%.go' GROUP BY id) AS F
     ON C.id = F.id), line)
 GROUP BY url
@@ -44,7 +44,7 @@ FROM (
   SELECT REGEXP_EXTRACT(line, r' ([a-z0-9\._]*)\.') package, id
   FROM (
      SELECT SPLIT(content, '\n') line, id
-     FROM [bigquery-public-data:github_repos.fh_contents_top400k_10pct]
+     FROM [bigquery-public-data:github_repos.sample_contents]
      WHERE content CONTAINS 'import'
      AND sample_path LIKE '%.java'
      HAVING LEFT(line, 6)='import'
@@ -64,7 +64,7 @@ REGEXP_REPLACE(REGEXP_REPLACE(word, r'^[^:]*:', ''), r':[^:]*$', '') as emoticon
 C.committer.time_sec as committed_at,
 count(*) as count
 FROM ( SELECT SPLIT(message, ' ') as word, committer.time_sec
-      FROM [bigquery-public-data:github_repos.commits]
+      FROM [bigquery-public-data:github_repos.sample_commits]
     ) as C
 where
 REGEXP_MATCH(word, r':[\w]+:') and

diff --git a/big_query_examples.md b/big_query_examples.md
@@ -1,15 +1,9 @@
-This table I like a lot! 10% of the files of the top 400k projects:
-[bigquery-public-data:github_repos.fh_contents_top400k_10pct]
-22.4 GB
-~2M rows
-
-- fh_* is only a temporary name, so you can tell which tables are the result of my experiments
-- we should develop all of our sample queries on the 10% files of the top 400k projects. With 22.4GB, the users free quota will only exhaust after ~40 queries (but replenishes every day, and each extra query is not that expensive)
-- the huge contents table will still be available for anyone wanting to play with them (but won't fall under the free quota)
-
 ### How many times shouldn't it happen...
 
 ```sql
+
+-- https://news.ycombinator.com/item?id=11396045
+
 SELECT count(*)
 FROM (SELECT id, repo_name, path
         FROM [bigquery-public-data:github_repos.files]

diff --git a/big_query_examples.md b/big_query_examples.md
@@ -1,3 +1,12 @@
+This table I like a lot! 10% of the files of the top 400k projects:
+[bigquery-public-data:github_repos.fh_contents_top400k_10pct]
+22.4 GB
+~2M rows
+
+- fh_* is only a temporary name, so you can tell which tables are the result of my experiments
+- we should develop all of our sample queries on the 10% files of the top 400k projects. With 22.4GB, the users free quota will only exhaust after ~40 queries (but replenishes every day, and each extra query is not that expensive)
+- the huge contents table will still be available for anyone wanting to play with them (but won't fall under the free quota)
+
 ### How many times shouldn't it happen...
 
 ```sql
@@ -33,6 +42,26 @@ HAVING url IS NOT NULL
 ORDER BY count DESC
 ```
 
+### Most commonly-used Java packages
+
+```sql
+SELECT package, COUNT(*) c
+FROM (
+  SELECT REGEXP_EXTRACT(line, r' ([a-z0-9\._]*)\.') package, id
+  FROM (
+     SELECT SPLIT(content, '\n') line, id
+     FROM [bigquery-public-data:github_repos.fh_contents_top400k_10pct]
+     WHERE content CONTAINS 'import'
+     AND sample_path LIKE '%.java'
+     HAVING LEFT(line, 6)='import'
+  )
+  GROUP BY package, id
+)
+GROUP BY 1
+ORDER BY c DESC
+LIMIT 500;
+```
+
 ### Emoji used in commit messages
 
 ```sql
@@ -51,16 +80,21 @@ order by count DESC
 LIMIT 100;
 ```
 
-### Most commits for .edu contributors (currently doesn't work)
+### Most commits for .edu contributors
 
 ```sql
-SELECT
-  REGEXP_EXTRACT(author.email, r'@(.+)') as email_domain,
-  count(*) as count
-FROM
-  [bigquery-public-data:github_repos.commits]
-WHERE TLD(REGEXP_EXTRACT(author.email, r'@(.+)')) = '.edu'
-GROUP by 1
-ORDER by count DESC
-LIMIT 50;
-```
+
+-- Note this uses the githubarchive tables
+
+SELECT REGEXP_EXTRACT(email, r'@(.*)') domain, COUNT(*) c
+FROM (
+  SELECT REGEXP_EXTRACT(payload, r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)') email
+  FROM [githubarchive:month.201512]
+  WHERE REGEXP_EXTRACT(payload, r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)') IS NOT null
+  GROUP BY 1
+  HAVING email CONTAINS '.edu'
+)
+GROUP BY 1
+ORDER BY 2 DESC
+LIMIT 500;
+```
diff --git a/big_query_examples.md b/big_query_examples.md
@@ -12,6 +12,27 @@ JOIN (SELECT id
 ON F.id = C.id;
 ```
 
+### Most commonly-used Go packages
+
+```sql
+SELECT
+  REGEXP_EXTRACT(line, r'"([^"]+)"') AS url,
+  COUNT(*) AS count
+FROM FLATTEN(
+  (SELECT
+  SPLIT(SPLIT(REGEXP_EXTRACT(content, r'.*import\s*[(]([^)]*)[)]'), '\n'), ';') AS line,
+  FROM
+    (SELECT id, content FROM [bigquery-public-data:github_repos.contents]
+     WHERE REGEXP_MATCH(content, r'.*import\s*[(][^)]*[)]')) AS C
+    JOIN EACH
+    (SELECT id FROM [bigquery-public-data:github_repos.files]
+     WHERE path LIKE '%.go' GROUP BY id) AS F
+    ON C.id = F.id), line)
+GROUP BY url
+HAVING url IS NOT NULL
+ORDER BY count DESC
+```
+
 ### Emoji used in commit messages
 
 ```sql

diff --git a/big_query_examples.md b/big_query_examples.md
@@ -1,19 +1,4 @@
-T
-
-```sql
-SELECT F.repo_name, F.path
-FROM (SELECT id, repo_name, path
-        FROM [bigquery-public-data:github_repos.files]
-      ) AS F
-JOIN (SELECT id
-        FROM [bigquery-public-data:github_repos.contents]
-       WHERE NOT binary
-         AND content CONTAINS 'This should never happen') AS C
-ON F.id = C.id
-LIMIT 5;
-```
-
-or: how many times shouldn't it happen
+### How many times shouldn't it happen...
 
 ```sql
 SELECT count(*)
@@ -27,7 +12,7 @@ JOIN (SELECT id
 ON F.id = C.id;
 ```
 
-Emoji in commit messages:
+### Emoji used in commit messages
 
 ```sql
 SELECT
@@ -45,7 +30,7 @@ order by count DESC
 LIMIT 100;
 ```
 
-Top .edu contributor
+### Most commits for .edu contributors (currently doesn't work)
 
 ```sql
 SELECT

diff --git a/big_query_examples.md b/big_query_examples.md
@@ -0,0 +1,60 @@
+T
+
+```sql
+SELECT F.repo_name, F.path
+FROM (SELECT id, repo_name, path
+        FROM [bigquery-public-data:github_repos.files]
+      ) AS F
+JOIN (SELECT id
+        FROM [bigquery-public-data:github_repos.contents]
+       WHERE NOT binary
+         AND content CONTAINS 'This should never happen') AS C
+ON F.id = C.id
+LIMIT 5;
+```
+
+or: how many times shouldn't it happen
+
+```sql
+SELECT count(*)
+FROM (SELECT id, repo_name, path
+        FROM [bigquery-public-data:github_repos.files]
+      ) AS F
+JOIN (SELECT id
+        FROM [bigquery-public-data:github_repos.contents]
+       WHERE NOT binary
+         AND content CONTAINS 'This should never happen') AS C
+ON F.id = C.id;
+```
+
+Emoji in commit messages:
+
+```sql
+SELECT
+REGEXP_REPLACE(REGEXP_REPLACE(word, r'^[^:]*:', ''), r':[^:]*$', '') as emoticon,
+C.committer.time_sec as committed_at,
+count(*) as count
+FROM ( SELECT SPLIT(message, ' ') as word, committer.time_sec
+      FROM [bigquery-public-data:github_repos.commits]
+    ) as C
+where
+REGEXP_MATCH(word, r':[\w]+:') and
+REGEXP_REPLACE(REGEXP_REPLACE(word, r'^[^:]*:', ''), r':[^:]*$', '') in ("+1", "-1", "100", "1234", "8ball", "a", "ab", "abc", "abcd", "accept", "aerial_tramway", "airplane", "alarm_clock", "alien", "ambulance", "anchor", "angel", "anger", "angry", "anguished", "ant", "apple", "aquarius", "aries", "arrow_backward", "arrow_double_down", "arrow_double_up", "arrow_down", "arrow_down_small", "arrow_forward", "arrow_heading_down", "arrow_heading_up", "arrow_left", "arrow_lower_left", "arrow_lower_right", "arrow_right", "arrow_right_hook", "arrow_up", "arrow_up_down", "arrow_up_small", "arrow_upper_left", "arrow_upper_right", "arrows_clockwise", "arrows_counterclockwise", "art", "articulated_lorry", "astonished", "atm", "b", "baby", "baby_bottle", "baby_chick", "baby_symbol", "back", "baggage_claim", "balloon", "ballot_box_with_check", "bamboo", "banana", "bangbang", "bank", "bar_chart", "barber", "baseball", "basketball", "bath", "bathtub", "battery", "bear", "bee", "beer", "beers", "beetle", "beginner", "bell", "bento", "bicyclist", "bike", "bikini", "bird", "birthday", "black_circle", "black_joker", "black_medium_small_square", "black_medium_square", "black_nib", "black_small_square", "black_square", "black_square_button", "blossom", "blowfish", "blue_book", "blue_car", "blue_heart", "blush", "boar", "boat", "bomb", "book", "bookmark", "bookmark_tabs", "books", "boom", "boot", "bouquet", "bow", "bowling", "bowtie", "boy", "bread", "bride_with_veil", "bridge_at_night", "briefcase", "broken_heart", "bug", "bulb", "bullettrain_front", "bullettrain_side", "bus", "busstop", "bust_in_silhouette", "busts_in_silhouette", "cactus", "cake", "calendar", "calling", "camel", "camera", "cancer", "candy", "capital_abcd", "capricorn", "car", "card_index", "carousel_horse", "cat", "cat2", "cd", "chart", "chart_with_downwards_trend", "chart_with_upwards_trend", "checkered_flag", "cherries", "cherry_blossom", "chestnut", "chicken", "children_crossing", "chocolate_bar", "christmas_tree", "church", "cinema", "circus_tent", "city_sunrise", "city_sunset", "cl", "clap", "clapper", "clipboard", "clock1", "clock10", "clock1030", "clock11", "clock1130", "clock12", "clock1230", "clock130", "clock2", "clock230", "clock3", "clock330", "clock4", "clock430", "clock5", "clock530", "clock6", "clock630", "clock7", "clock730", "clock8", "clock830", "clock9", "clock930", "closed_book", "closed_lock_with_key", "closed_umbrella", "cloud", "clubs", "cn", "cocktail", "coffee", "cold_sweat", "collision", "computer", "confetti_ball", "confounded", "confused", "congratulations", "construction", "construction_worker", "convenience_store", "cookie", "cool", "cop", "copyright", "corn", "couple", "couple_with_heart", "couplekiss", "cow", "cow2", "credit_card", "crocodile", "crossed_flags", "crown", "cry", "crying_cat_face", "crystal_ball", "cupid", "curly_loop", "currency_exchange", "curry", "custard", "customs", "cyclone", "dancer", "dancers", "dango", "dart", "dash", "date", "de", "deciduous_tree", "department_store", "diamond_shape_with_a_dot_inside", "diamonds", "disappointed", "disappointed_relieved", "dizzy", "dizzy_face", "do_not_litter", "dog", "dog2", "dollar", "dolls", "dolphin", "donut", "door", "doughnut", "dragon", "dragon_face", "dress", "dromedary_camel", "droplet", "dvd", "e\\-mail", "ear", "ear_of_rice", "earth_africa", "earth_americas", "earth_asia", "egg", "eggplant", "eight", "eight_pointed_black_star", "eight_spoked_asterisk", "electric_plug", "elephant", "email", "end", "envelope", "es", "euro", "european_castle", "european_post_office", "evergreen_tree", "exclamation", "expressionless", "eyeglasses", "eyes", "facepunch", "factory", "fallen_leaf", "family", "fast_forward", "fax", "fearful", "feelsgood", "feet", "ferris_wheel", "file_folder", "finnadie", "fire", "fire_engine", "fireworks", "first_quarter_moon", "first_quarter_moon_with_face", "fish", "fish_cake", "fishing_pole_and_fish", "fist", "five", "flags", "flashlight", "floppy_disk", "flower_playing_cards", "flushed", "foggy", "football", "fork_and_knife", "fountain", "four", "four_leaf_clover", "fr", "free", "fried_shrimp", "fries", "frog", "frowning", "fu", "fuelpump", "full_moon", "full_moon_with_face", "game_die", "gb", "gem", "gemini", "ghost", "gift", "gift_heart", "girl", "globe_with_meridians", "goat", "goberserk", "godmode", "golf", "grapes", "green_apple", "green_book", "green_heart", "grey_exclamation", "grey_question", "grimacing", "grin", "grinning", "guardsman", "guitar", "gun", "haircut", "hamburger", "hammer", "hamster", "hand", "handbag", "hankey", "hash", "hatched_chick", "hatching_chick", "headphones", "hear_no_evil", "heart", "heart_decoration", "heart_eyes", "heart_eyes_cat", "heartbeat", "heartpulse", "hearts", "heavy_check_mark", "heavy_division_sign", "heavy_dollar_sign", "heavy_exclamation_mark", "heavy_minus_sign", "heavy_multiplication_x", "heavy_plus_sign", "helicopter", "herb", "hibiscus", "high_brightness", "high_heel", "hocho", "honey_pot", "honeybee", "horse", "horse_racing", "hospital", "hotel", "hotsprings", "hourglass", "hourglass_flowing_sand", "house", "house_with_garden", "hurtrealbad", "hushed", "ice_cream", "icecream", "id", "ideograph_advantage", "imp", "inbox_tray", "incoming_envelope", "information_desk_person", "information_source", "innocent", "interrobang", "iphone", "it", "izakaya_lantern", "jack_o_lantern", "japan", "japanese_castle", "japanese_goblin", "japanese_ogre", "jeans", "joy", "joy_cat", "jp", "key", "keycap_ten", "kimono", "kiss", "kissing", "kissing_cat", "kissing_closed_eyes", "kissing_face", "kissing_heart", "kissing_smiling_eyes", "koala", "koko", "kr", "large_blue_circle", "large_blue_diamond", "large_orange_diamond", "last_quarter_moon", "last_quarter_moon_with_face", "laughing", "leaves", "ledger", "left_luggage", "left_right_arrow", "leftwards_arrow_with_hook", "lemon", "leo", "leopard", "libra", "light_rail", "link", "lips", "lipstick", "lock", "lock_with_ink_pen", "lollipop", "loop", "loudspeaker", "love_hotel", "love_letter", "low_brightness", "m", "mag", "mag_right", "mahjong", "mailbox", "mailbox_closed", "mailbox_with_mail", "mailbox_with_no_mail", "man", "man_with_gua_pi_mao", "man_with_turban", "mans_shoe", "maple_leaf", "mask", "massage", "meat_on_bone", "mega", "melon", "memo", "mens", "metal", "metro", "microphone", "microscope", "milky_way", "minibus", "minidisc", "mobile_phone_off", "money_with_wings", "moneybag", "monkey", "monkey_face", "monorail", "moon", "mortar_board", "mount_fuji", "mountain_bicyclist", "mountain_cableway", "mountain_railway", "mouse", "mouse2", "movie_camera", "moyai", "muscle", "mushroom", "musical_keyboard", "musical_note", "musical_score", "mute", "nail_care", "name_badge", "neckbeard", "necktie", "negative_squared_cross_mark", "neutral_face", "new", "new_moon", "new_moon_with_face", "newspaper", "ng", "nine", "no_bell", "no_bicycles", "no_entry", "no_entry_sign", "no_good", "no_mobile_phones", "no_mouth", "no_pedestrians", "no_smoking", "non\\-potable_water", "nose", "notebook", "notebook_with_decorative_cover", "notes", "nut_and_bolt", "o", "o2", "ocean", "octocat", "octopus", "oden", "office", "ok", "ok_hand", "ok_woman", "older_man", "older_woman", "on", "oncoming_automobile", "oncoming_bus", "oncoming_police_car", "oncoming_taxi", "one", "open_file_folder", "open_hands", "open_mouth", "ophiuchus", "orange_book", "outbox_tray", "ox", "package", "page_facing_up", "page_with_curl", "pager", "palm_tree", "panda_face", "paperclip", "parking", "part_alternation_mark", "partly_sunny", "passport_control", "paw_prints", "peach", "pear", "pencil", "pencil2", "penguin", "pensive", "performing_arts", "persevere", "person_frowning", "person_with_blond_hair", "person_with_pouting_face", "phone", "pig", "pig2", "pig_nose", "pill", "pineapple", "pisces", "pizza", "plus1", "point_down", "point_left", "point_right", "point_up", "point_up_2", "police_car", "poodle", "poop", "post_office", "postal_horn", "postbox", "potable_water", "pouch", "poultry_leg", "pound", "pouting_cat", "pray", "princess", "punch", "purple_heart", "purse", "pushpin", "put_litter_in_its_place", "question", "rabbit", "rabbit2", "racehorse", "radio", "radio_button", "rage", "rage1", "rage2", "rage3", "rage4", "railway_car", "rainbow", "raised_hand", "raised_hands", "raising_hand", "ram", "ramen", "rat", "recycle", "red_car", "red_circle", "registered", "relaxed", "relieved", "repeat", "repeat_one", "restroom", "revolving_hearts", "rewind", "ribbon", "rice", "rice_ball", "rice_cracker", "rice_scene", "ring", "rocket", "roller_coaster", "rooster", "rose", "rotating_light", "round_pushpin", "rowboat", "ru", "rugby_football", "runner", "running", "running_shirt_with_sash", "sa", "sagittarius", "sailboat", "sake", "sandal", "santa", "satellite", "satisfied", "saxophone", "school", "school_satchel", "scissors", "scorpius", "scream", "scream_cat", "scroll", "seat", "secret", "see_no_evil", "seedling", "seven", "shaved_ice", "sheep", "shell", "ship", "shipit", "shirt", "shit", "shoe", "shower", "signal_strength", "six", "six_pointed_star", "ski", "skull", "sleeping", "sleepy", "slot_machine", "small_blue_diamond", "small_orange_diamond", "small_red_triangle", "small_red_triangle_down", "smile", "smile_cat", "smiley", "smiley_cat", "smiling_imp", "smirk", "smirk_cat", "smoking", "snail", "snake", "snowboarder", "snowflake", "snowman", "sob", "soccer", "soon", "sos", "sound", "space_invader", "spades", "spaghetti", "sparkle", "sparkler", "sparkles", "sparkling_heart", "speak_no_evil", "speaker", "speech_balloon", "speedboat", "squirrel", "star", "star2", "stars", "station", "statue_of_liberty", "steam_locomotive", "stew", "straight_ruler", "strawberry", "stuck_out_tongue", "stuck_out_tongue_closed_eyes", "stuck_out_tongue_winking_eye", "sun_with_face", "sunflower", "sunglasses", "sunny", "sunrise", "sunrise_over_mountains", "surfer", "sushi", "suspect", "suspension_railway", "sweat", "sweat_drops", "sweat_smile", "sweet_potato", "swimmer", "symbols", "syringe", "tada", "tanabata_tree", "tangerine", "taurus", "taxi", "tea", "telephone", "telephone_receiver", "telescope", "tennis", "tent", "thought_balloon", "three", "thumbsdown", "thumbsup", "ticket", "tiger", "tiger2", "tired_face", "tm", "toilet", "tokyo_tower", "tomato", "tongue", "top", "tophat", "tractor", "traffic_light", "train", "train2", "tram", "triangular_flag_on_post", "triangular_ruler", "trident", "triumph", "trolleybus", "trollface", "trophy", "tropical_drink", "tropical_fish", "truck", "trumpet", "tshirt", "tulip", "turtle", "tv", "twisted_rightwards_arrows", "two", "two_hearts", "two_men_holding_hands", "two_women_holding_hands", "u5272", "u5408", "u55b6", "u6307", "u6708", "u6709", "u6e80", "u7121", "u7533", "u7981", "u7a7a", "uk", "umbrella", "unamused", "underage", "unlock", "up", "us", "v", "vertical_traffic_light", "vhs", "vibration_mode", "video_camera", "video_game", "violin", "virgo", "volcano", "vs", "walking", "waning_crescent_moon", "waning_gibbous_moon", "warning", "watch", "water_buffalo", "watermelon", "wave", "wavy_dash", "waxing_crescent_moon", "waxing_gibbous_moon", "wc", "weary", "wedding", "whale", "whale2", "wheelchair", "white_check_mark", "white_circle", "white_flower", "white_large_square", "white_medium_small_square", "white_medium_square", "white_small_square", "white_square_button", "wind_chime", "wine_glass", "wink", "wolf", "woman", "womans_clothes", "womans_hat", "womens", "worried", "wrench", "x", "yellow_heart", "yen", "yum", "zap", "zero", "zzz")
+group by 1, 2
+order by count DESC
+LIMIT 100;
+```
+
+Top .edu contributor
+
+```sql
+SELECT
+  REGEXP_EXTRACT(author.email, r'@(.+)') as email_domain,
+  count(*) as count
+FROM
+  [bigquery-public-data:github_repos.commits]
+WHERE TLD(REGEXP_EXTRACT(author.email, r'@(.+)')) = '.edu'
+GROUP by 1
+ORDER by count DESC
+LIMIT 50;
+```