Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save KTZgraph/7d70d05e8a1ab0cf80dbc997721c325d to your computer and use it in GitHub Desktop.

Select an option

Save KTZgraph/7d70d05e8a1ab0cf80dbc997721c325d to your computer and use it in GitHub Desktop.

Revisions

  1. @datahutrepo datahutrepo created this gist Dec 6, 2016.
    133 changes: 133 additions & 0 deletions linkedin_companies_scraper
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,133 @@
    # -*- coding: utf-8 -*-
    import json
    import requests
    import re
    import random
    import urllib
    import lxml.html
    import bs4
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')


    HEADERS = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    file = open('company_data.json', 'w')
    file.write('[')
    file.close()
    COUNT = 0

    def increment():
    global COUNT
    COUNT = COUNT+1

    def fetch_request(url):
    try:
    fetch_url = requests.get(url, headers=HEADERS)
    except:
    try:
    fetch_url = requests.get(url, headers=HEADERS)
    except:
    try:
    fetch_url = requests.get(url, headers=HEADERS)
    except:
    fetch_url = ''
    return fetch_url

    def parse_company_urls(company_url):

    if company_url:
    if '/company/' in company_url:
    parse_company_data(company_url)
    else:
    parent_url = company_url
    fetch_company_url=fetch_request(company_url)
    if fetch_company_url:
    sel = lxml.html.fromstring(fetch_company_url.content)
    COMPANIES_XPATH = '//div[@class="section last"]/div/ul/li/a/@href'
    companies_urls = sel.xpath(COMPANIES_XPATH)
    if companies_urls:
    if '/company/' in companies_urls[0]:
    print 'Parsing From Category ', parent_url
    print '-------------------------------------------------------------------------------------'
    for company_url in companies_urls:
    parse_company_urls(company_url)
    else:
    pass


    def parse_company_data(company_data_url):

    if company_data_url:
    fetch_company_data = fetch_request(company_data_url)
    if fetch_company_data.status_code == 200:
    try:
    source = fetch_company_data.content.decode('utf-8')
    sel = lxml.html.fromstring(source)
    # CODE_XPATH = '//code[@id="stream-promo-top-bar-embed-id-content"]'
    # code_text = sel.xpath(CODE_XPATH).re(r'<!--(.*)-->')
    code_text = sel.get_element_by_id(
    'stream-promo-top-bar-embed-id-content')
    if len(code_text) > 0:
    code_text = str(code_text[0])
    code_text = re.findall(r'<!--(.*)-->', str(code_text))
    code_text = code_text[0].strip() if code_text else '{}'
    json_data = json.loads(code_text)
    if json_data.get('squareLogo', ''):
    company_pic = 'https://media.licdn.com/mpr/mpr/shrink_200_200' + \
    json_data.get('squareLogo', '')
    elif json_data.get('legacyLogo', ''):
    company_pic = 'https://media.licdn.com/media' + \
    json_data.get('legacyLogo', '')
    else:
    company_pic = ''
    company_name = json_data.get('companyName', '')
    followers = str(json_data.get('followerCount', ''))

    # CODE_XPATH = '//code[@id="stream-about-section-embed-id-content"]'
    # code_text = sel.xpath(CODE_XPATH).re(r'<!--(.*)-->')
    code_text = sel.get_element_by_id(
    'stream-about-section-embed-id-content')
    if len(code_text) > 0:
    code_text = str(code_text[0]).encode('utf-8')
    code_text = re.findall(r'<!--(.*)-->', str(code_text))
    code_text = code_text[0].strip() if code_text else '{}'
    json_data = json.loads(code_text)
    company_industry = json_data.get('industry', '')
    item = {'company_name': str(company_name.encode('utf-8')),
    'followers': str(followers),
    'company_industry': str(company_industry.encode('utf-8')),
    'logo_url': str(company_pic),
    'url': str(company_data_url.encode('utf-8')), }
    increment()

    print item

    file = open('company_data.json', 'a')
    file.write(str(item)+',\n')
    file.close()
    except:
    pass
    else:
    pass


    fetch_company_dir = fetch_request('https://www.linkedin.com/directory/companies/')
    if fetch_company_dir:
    print 'Starting Company Url Scraping'
    print '-----------------------------'
    sel = lxml.html.fromstring(fetch_company_dir.content)
    SUB_PAGES_XPATH = '//div[@class="bucket-list-container"]/ol/li/a/@href'
    sub_pages = sel.xpath(SUB_PAGES_XPATH)
    print 'Company Category URL list'
    print '--------------------------'
    print sub_pages
    if sub_pages:
    for sub_page in sub_pages:
    parse_company_urls(sub_page)
    else:
    pass