Skip to content

Instantly share code, notes, and snippets.

@efazati
Created June 7, 2016 07:45
Show Gist options
  • Save efazati/fe35d84ee9d1f760f4b5230ef29609a8 to your computer and use it in GitHub Desktop.
Save efazati/fe35d84ee9d1f760f4b5230ef29609a8 to your computer and use it in GitHub Desktop.

Revisions

  1. efazati created this gist Jun 7, 2016.
    64 changes: 64 additions & 0 deletions simple_crawl.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    #-*- coding: utf-8 -*-

    from lxml import html
    import requests
    from pymongo import MongoClient
    from pprint import pprint
    import urllib
    import telepot
    from datetime import datetime

    # --- Crawler / bot configuration ---------------------------------------
    # Listing page that gets scraped for photo articles.
    url = "http://www.yjc.ir/fa/photo"

    # Telegram bot token (blank in this file) and the chat that receives photos.
    token = ''
    chat_id = '@...'

    # MongoDB handles: local server -> database "telepy" -> collection "article".
    client = MongoClient('mongodb://localhost:27017/')
    db = client.telepy
    article_obj = db['article']

    # Module-level placeholder; no code visible in this file reads it.
    element = ''
    def data_gathering():
        """Scrape the listing page at ``url`` and return its photo articles.

        Every element carrying the ``ax_faal`` class becomes a dict with the
        keys ``img``, ``title``, ``data`` (time of the crawl) and ``source``,
        plus ``url`` when the element contains a link with at least two
        attributes.  Elements without an image or a title node are skipped
        instead of aborting the whole crawl with an ``IndexError``.

        Returns:
            list of dict, in page order.

        Raises:
            requests.RequestException: if the page cannot be fetched, which
                includes the server not answering within the timeout.
        """
        # Without a timeout an unresponsive server would hang the crawler forever.
        page = requests.get(url, timeout=30)
        tree = html.fromstring(page.content)

        result = []
        for article in tree.find_class('ax_faal'):
            images = article.cssselect('img')
            titles = article.cssselect('.title_txt1')
            if not images or not titles:
                # Malformed entry: nothing usable to post, move on.
                continue

            img_attrs = images[0].values()
            if not img_attrs:
                continue

            # NOTE(review): takes the LAST attribute value of <img>, presumably
            # the image address -- this depends on attribute order in the markup.
            item = {'img': img_attrs[-1]}

            links = article.cssselect('a')
            if links:
                link_attrs = links[0].values()
                # NOTE(review): SECOND attribute of <a>, presumably the href --
                # confirm against the page markup.
                if len(link_attrs) > 1:
                    item['url'] = link_attrs[1]

            item['title'] = titles[0].text
            # Key is spelled 'data' in stored documents; kept as-is for compatibility.
            item['data'] = datetime.now()
            item['source'] = 'yjc'
            result.append(item)

        return result

    def submit_data(bot, row):
        """Post one article's photo to the Telegram chat if it is new.

        ``store_db(row)`` decides whether the article is new (and records it).
        For a new article the image at ``row['img']`` is opened and sent to
        ``chat_id`` with ``row['title']`` in the caption; the image URL and the
        current time are printed for tracing.

        Args:
            bot: object exposing ``sendPhoto(chat_id, photo, caption=...)``.
            row: article dict with at least the keys ``img`` and ``title``.

        Returns:
            Whatever ``bot.sendPhoto`` returns for a new article, else ``None``.
        """
        # NOTE(review): the row is stored BEFORE the photo is sent, so a failed
        # send is not retried on the next run -- confirm this is intended.
        if not store_db(row):
            return None

        rawimg = urllib.urlopen(row['img'])
        try:
            # Single-argument print calls give the same output as the old
            # print statements and parse on both Python 2 and 3.
            print('submit img url %s' % row['img'])
            print(datetime.now())
            return bot.sendPhoto(
                chat_id,
                ('newsimage.jpg', rawimg),
                caption='%s - @axekhabar' % row['title']
            )
        finally:
            # Release the HTTP response whether the upload succeeded or failed.
            rawimg.close()

    def store_db(row):
        """Record ``row`` in the article collection unless its image is known.

        Args:
            row: article dict; its ``img`` value is the de-duplication key.

        Returns:
            ``True`` if the row was inserted, ``False`` if a document with the
            same ``img`` value already existed.
        """
        # NOTE(review): find-then-insert is two separate operations; two
        # crawlers running at once could both insert the same image.
        if article_obj.find_one({"img": row['img']}) is not None:
            return False

        # The generated _id is not used by callers, so it is not captured
        # (the old code bound it to a local that shadowed the builtin ``id``).
        article_obj.insert_one(row)
        return True

    def submit_alldata(data):
        """Run every scraped article through ``submit_data`` with one shared bot.

        Args:
            data: iterable of article dicts, handed to ``submit_data`` in order.
        """
        bot = telepot.Bot(token)
        # The reply is discarded; the call itself is kept from the original.
        bot.getMe()
        for entry in data:
            submit_data(bot, entry)


    # Script body: runs whenever this module is executed or imported.
    # 1. Log the start time (Python 2 print statement).
    print 'started', datetime.now()
    # 2. Scrape the listing page into a list of article dicts.
    result = data_gathering()
    # 3. Hand each article to the Telegram bot; already-stored ones are skipped.
    submit_alldata(result)