This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | from scrapy.http import HtmlResponse | |
| # ... | |
| class TechcrunchSpider(CrawlSpider): | |
| # ... | |
| def parse_item(self, response): | |
| json_res = json.loads(response.body) | |
| if not isinstance(json_res, list) or len(json_res) < 1: | |
| return None | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # -*- coding: utf-8 -*- | |
| from scrapy.loader import ItemLoader | |
| from web_scraper.items import Article | |
| import json | |
| #... | |
| class TechcrunchSpider(CrawlSpider): | |
| # ... | |
| def parse_item(self, response): | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # -*- coding: utf-8 -*- | |
| from scrapy import Item, Field | |
| from scrapy.loader.processors import TakeFirst, Join, Compose | |
| class Article(Item): | |
| title = Field(output_processor=TakeFirst()) | |
| publish_date = Field(output_processor=TakeFirst()) | |
| content = Field( | |
| output_processor=Compose(lambda v: filter(None, v), Join('')) | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # -*- coding: utf-8 -*- | |
| import re | |
| # ... | |
| def process_value(value): | |
| match = re.search(r'\d+/\d+/\d+/(.+)/', value) | |
| if not match: | |
| return None | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # -*- coding: utf-8 -*- | |
| import scrapy | |
| from scrapy.linkextractors import LinkExtractor | |
| from scrapy.spiders import CrawlSpider, Rule | |
| class TechcrunchSpider(CrawlSpider): | |
| name = 'techcrunch' | |
| allowed_domains = ['techcrunch.com'] | |
| start_urls = ['http://techcrunch.com/'] | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | --- | |
| Description: "EMR cluster to run Apache Beam App in Spark." | |
| Resources: | |
| ... | |
| EmrCluster: | |
| Type: "AWS::EMR::Cluster" | |
| Properties: | |
| ... | |
| JobFlowRole: EMR_EC2_DefaultRole | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <project ...> | |
| <build> | |
| <pluginManagement> | |
| <!-- NOT THIS! --> | |
| <plugins> | |
| ... | |
| </plugins> | |
| </pluginManagement> | |
| <!-- BUT THIS! Put the maven-shade-plugin under this tag --> | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <!-- Running Beam in Spark --> | |
| <dependency> | |
| <groupId>org.apache.beam</groupId> | |
| <artifactId>beam-runners-spark</artifactId> | |
| <version>2.9.0</version> | |
| </dependency> | |
| <dependency> | |
| <groupId>org.apache.spark</groupId> | |
| <artifactId>spark-core_2.12</artifactId> | |
| <version>2.4.0</version> | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | object App { | |
| private final val DEFAULT_AWS_REGION = "us-west-2" | |
| trait AppOptions extends PipelineOptions { | |
| ... | |
| @Description("AWS region") | |
| def getAwsRegion: String | |
| def setAwsRegion(value: String): Unit | |
| } | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <!-- AWS --> | |
| <dependency> | |
| <groupId>org.apache.beam</groupId> | |
| <artifactId>beam-sdks-java-io-amazon-web-services</artifactId> | |
| <version>2.9.0</version> | |
| </dependency> | 
NewerOlder