Skip to content

Instantly share code, notes, and snippets.

View canyousayyes's full-sized avatar

Gene Ng canyousayyes

View GitHub Profile
from scrapy.http import HtmlResponse
# ...
# Excerpt of a CrawlSpider subclass; elided parts are marked "# ...".
# NOTE(review): `json` and `CrawlSpider` are not imported in the visible
# lines -- presumably they are imported in the elided part; confirm.
class TechcrunchSpider(CrawlSpider):
# ...
# Handles a response whose body is expected to be JSON.
def parse_item(self, response):
# Decode the raw response body into Python objects.
json_res = json.loads(response.body)
# Return None unless the decoded payload is a non-empty list.
if not isinstance(json_res, list) or len(json_res) < 1:
return None
# NOTE(review): the excerpt ends here; what happens with a valid list is not shown.
# -*- coding: utf-8 -*-
from scrapy.loader import ItemLoader
from web_scraper.items import Article
import json
#...
# Excerpt of a CrawlSpider subclass; elided parts are marked "# ...".
class TechcrunchSpider(CrawlSpider):
# ...
# NOTE(review): only the signature of parse_item is visible; its body is
# cut off in this excerpt.
def parse_item(self, response):
# -*- coding: utf-8 -*-
from scrapy import Item, Field
from scrapy.loader.processors import TakeFirst, Join, Compose
# Item subclass declaring three fields: title, publish_date and content.
class Article(Item):
# TakeFirst() as output processor: per the Scrapy docs it yields the first
# non-null, non-empty value collected for the field.
title = Field(output_processor=TakeFirst())
publish_date = Field(output_processor=TakeFirst())
content = Field(
# Compose applies the callables in order: filter(None, v) drops falsy
# entries, then Join('') concatenates what is left with no separator.
output_processor=Compose(lambda v: filter(None, v), Join(''))
# NOTE(review): the Field( call above is not closed in this excerpt.
# -*- coding: utf-8 -*-
import re
# ...
# Inspects `value` for a path of the form <digits>/<digits>/<digits>/<text>/.
# NOTE(review): presumably a year/month/day URL path -- confirm against the caller.
def process_value(value):
# Three slash-terminated digit runs, then a greedy capture group (group 1)
# that must itself be followed by a '/'.
match = re.search(r'\d+/\d+/\d+/(.+)/', value)
# No such pattern in `value`: return None.
if not match:
return None
# NOTE(review): the excerpt ends here; how a successful match is used is not shown.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# CrawlSpider subclass configured for techcrunch.com (excerpt; any rules or
# callbacks are not visible here).
class TechcrunchSpider(CrawlSpider):
# Identifier Scrapy uses to refer to this spider.
name = 'techcrunch'
# Domains the spider is allowed to crawl.
allowed_domains = ['techcrunch.com']
# URLs the crawl starts from.
start_urls = ['http://techcrunch.com/']
---
# Template excerpt; the bare "..." lines mark content elided from the original.
Description: "EMR cluster to run Apache Beam App in Spark."
Resources:
...
# Resource named EmrCluster, of type AWS::EMR::Cluster.
EmrCluster:
Type: "AWS::EMR::Cluster"
Properties:
...
# NOTE(review): in CloudFormation, JobFlowRole names the EC2 instance profile
# for the cluster's instances; EMR_EC2_DefaultRole is presumably the
# AWS-provided default -- confirm it exists in the target account.
JobFlowRole: EMR_EC2_DefaultRole
<project ...>
<build>
<pluginManagement>
<!-- NOT THIS! Plugins listed under <pluginManagement> are only configured for
     reuse; Maven does not execute them unless they are also declared in a
     <plugins> element directly under <build>. -->
<plugins>
...
</plugins>
</pluginManagement>
<!-- BUT THIS! Put the maven-shade-plugin under this tag, i.e. presumably the
     <plugins> element that sits directly inside <build> (the excerpt is cut
     off before that tag appears). -->
<!-- Running Beam in Spark: the Beam Spark runner plus the Spark core library. -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-spark</artifactId>
<version>2.9.0</version>
</dependency>
<!-- NOTE(review): the _2.12 suffix selects the Scala 2.12 build of spark-core.
     Confirm that beam-runners-spark 2.9.0 and the Spark installed on the
     cluster use the same Scala binary version; a mismatch typically fails at
     runtime. The rest of this dependency (e.g. its scope) is cut off in this
     excerpt. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>2.4.0</version>
// Application object (excerpt: its closing brace and remaining members are not visible).
object App {
// Default AWS region string; where it is applied is not visible in this excerpt.
private final val DEFAULT_AWS_REGION = "us-west-2"
// Options trait extending PipelineOptions; "..." marks elided members.
trait AppOptions extends PipelineOptions {
...
// Getter/setter pair for the awsRegion option, described as "AWS region".
@Description("AWS region")
def getAwsRegion: String
def setAwsRegion(value: String): Unit
}
<!-- AWS: Apache Beam's Java IO module for Amazon Web Services. -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-amazon-web-services</artifactId>
<version>2.9.0</version>
</dependency>