
@mcadhoc
Forked from scrapehero/zillow.py
Last active May 27, 2020 19:21

Revisions

  1. @ikicker revised this gist May 27, 2020. 1 changed file with 6 additions and 2 deletions.
    8 changes: 6 additions & 2 deletions zillow.py
    @@ -3,6 +3,7 @@
     import unicodecsv as csv
     import argparse
     import json
    +from urllib.request import Request, urlopen


     def clean(text):
    @@ -114,12 +115,15 @@ def get_data_from_json(raw_json_data):
     def parse(zipcode, filter=None):
         url = create_url(zipcode, filter)
         response = get_response(url)

         if not response:
             print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
             return None

    +    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    +    webpage = urlopen(req).read()
    +
    -    parser = html.fromstring(response.text)
    +    parser = html.fromstring(webpage)
         search_results = parser.xpath("//div[@id='search-results']//article")

         if not search_results:
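    This revision works around Zillow's blocking by re-fetching the page with urllib and a spoofed User-Agent instead of parsing the existing requests response. A minimal sketch of that fetch pattern, assuming a bare Mozilla/5.0 string still gets past the block:

        from urllib.request import Request, urlopen

        def fetch(url):
            # A browser-like User-Agent avoids the immediate rejection that
            # urllib's default "Python-urllib/x.y" agent tends to trigger.
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            return urlopen(req).read()  # raw bytes; lxml parses these directly

    Note the revision still calls get_response() first, so the page is fetched twice; only the urllib response is actually parsed.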
  2. @scrapehero revised this gist May 16, 2019. 1 changed file with 178 additions and 80 deletions.
    258 changes: 178 additions & 80 deletions zillow.py
    @@ -2,89 +2,187 @@
     import requests
     import unicodecsv as csv
     import argparse
    +import json

    -def parse(zipcode,filter=None):
    -
    -    if filter=="newest":
    -        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    -    elif filter == "cheapest":
    -        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    -    else:
    -        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    -
    -    for i in range(5):
    -        # try:
    -        headers= {
    -        'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    -        'accept-encoding':'gzip, deflate, sdch, br',
    -        'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    -        'cache-control':'max-age=0',
    -        'upgrade-insecure-requests':'1',
    -        'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    -        }
    -        response = requests.get(url,headers=headers)
    -        print(response.status_code)
    -        parser = html.fromstring(response.text)
    -        search_results = parser.xpath("//div[@id='search-results']//article")
    -        properties_list = []
    -
    -        for properties in search_results:
    -            raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
    -            raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    -            raw_state= properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
    -            raw_postal_code= properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
    -            raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
    -            raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
    -            raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
    -            url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
    -            raw_title = properties.xpath(".//h4//text()")
    -
    -            address = ' '.join(' '.join(raw_address).split()) if raw_address else None
    -            city = ''.join(raw_city).strip() if raw_city else None
    -            state = ''.join(raw_state).strip() if raw_state else None
    -            postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
    -            price = ''.join(raw_price).strip() if raw_price else None
    -            info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',')
    -            broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
    -            title = ''.join(raw_title) if raw_title else None
    -            property_url = "https://www.zillow.com"+url[0] if url else None
    -            is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
    -            properties = {
    -                'address':address,
    -                'city':city,
    -                'state':state,
    -                'postal_code':postal_code,
    -                'price':price,
    -                'facts and features':info,
    -                'real estate provider':broker,
    -                'url':property_url,
    -                'title':title
    -            }
    -            if is_forsale:
    -                properties_list.append(properties)
    -        return properties_list
    -        # except:
    -        #     print ("Failed to process the page",url)
    -
    -if __name__=="__main__":
    -    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    -    argparser.add_argument('zipcode',help = '')
    -    sortorder_help = """

    +def clean(text):
    +    if text:
    +        return ' '.join(' '.join(text).split())
    +    return None
    +
    +
    +def get_headers():
    +    # Creating headers.
    +    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    +               'accept-encoding': 'gzip, deflate, sdch, br',
    +               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    +               'cache-control': 'max-age=0',
    +               'upgrade-insecure-requests': '1',
    +               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    +    return headers
    +
    +
    +def create_url(zipcode, filter):
    +    # Creating Zillow URL based on the filter.
    +
    +    if filter == "newest":
    +        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    +    elif filter == "cheapest":
    +        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    +    else:
    +        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    +    print(url)
    +    return url
    +
    +
    +def save_to_file(response):
    +    # saving response to `response.html`
    +
    +    with open("response.html", 'w') as fp:
    +        fp.write(response.text)
    +
    +
    +def write_data_to_csv(data):
    +    # saving scraped data to csv.
    +
    +    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
    +        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
    +        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    +        writer.writeheader()
    +        for row in data:
    +            writer.writerow(row)
    +
    +
    +def get_response(url):
    +    # Getting response from zillow.com.
    +
    +    for i in range(5):
    +        response = requests.get(url, headers=get_headers())
    +        print("status code received:", response.status_code)
    +        if response.status_code != 200:
    +            # saving response to file for debugging purpose.
    +            save_to_file(response)
    +            continue
    +        else:
    +            save_to_file(response)
    +            return response
    +    return None
    +
    +
    +def get_data_from_json(raw_json_data):
    +    # getting data from json (type 2 of their A/B testing page)
    +
    +    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    +    properties_list = []
    +
    +    try:
    +        json_data = json.loads(cleaned_data)
    +        search_results = json_data.get('searchResults').get('listResults', [])
    +
    +        for properties in search_results:
    +            address = properties.get('addressWithZip')
    +            property_info = properties.get('hdpData', {}).get('homeInfo')
    +            city = property_info.get('city')
    +            state = property_info.get('state')
    +            postal_code = property_info.get('zipcode')
    +            price = properties.get('price')
    +            bedrooms = properties.get('beds')
    +            bathrooms = properties.get('baths')
    +            area = properties.get('area')
    +            info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
    +            broker = properties.get('brokerName')
    +            property_url = properties.get('detailUrl')
    +            title = properties.get('statusText')
    +
    +            data = {'address': address,
    +                    'city': city,
    +                    'state': state,
    +                    'postal_code': postal_code,
    +                    'price': price,
    +                    'facts and features': info,
    +                    'real estate provider': broker,
    +                    'url': property_url,
    +                    'title': title}
    +            properties_list.append(data)
    +
    +        return properties_list
    +
    +    except ValueError:
    +        print("Invalid json")
    +        return None
    +
    +
    +def parse(zipcode, filter=None):
    +    url = create_url(zipcode, filter)
    +    response = get_response(url)
    +
    +    if not response:
    +        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
    +        return None
    +
    +    parser = html.fromstring(response.text)
    +    search_results = parser.xpath("//div[@id='search-results']//article")
    +
    +    if not search_results:
    +        print("parsing from json data")
    +        # identified as type 2 page
    +        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
    +        return get_data_from_json(raw_json_data)
    +
    +    print("parsing from html page")
    +    properties_list = []
    +    for properties in search_results:
    +        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
    +        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    +        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
    +        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
    +        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
    +        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
    +        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
    +        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
    +        raw_title = properties.xpath(".//h4//text()")
    +
    +        address = clean(raw_address)
    +        city = clean(raw_city)
    +        state = clean(raw_state)
    +        postal_code = clean(raw_postal_code)
    +        price = clean(raw_price)
    +        info = clean(raw_info).replace(u"\xb7", ',')
    +        broker = clean(raw_broker_name)
    +        title = clean(raw_title)
    +        property_url = "https://www.zillow.com" + url[0] if url else None
    +        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
    +
    +        properties = {'address': address,
    +                      'city': city,
    +                      'state': state,
    +                      'postal_code': postal_code,
    +                      'price': price,
    +                      'facts and features': info,
    +                      'real estate provider': broker,
    +                      'url': property_url,
    +                      'title': title}
    +        if is_forsale:
    +            properties_list.append(properties)
    +    return properties_list
    +
    +
    +if __name__ == "__main__":
    +    # Reading arguments
    +
    +    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    +    argparser.add_argument('zipcode', help='')
    +    sortorder_help = """
     available sort orders are :
     newest : Latest property details,
     cheapest : Properties with cheapest price
     """
    -    argparser.add_argument('sort',nargs='?',help = sortorder_help,default ='Homes For You')
    -    args = argparser.parse_args()
    -    zipcode = args.zipcode
    -    sort = args.sort
    -    print ("Fetching data for %s"%(zipcode))
    -    scraped_data = parse(zipcode,sort)
    -    print ("Writing data to output file")
    -    with open("properties-%s.csv"%(zipcode),'wb')as csvfile:
    -        fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url']
    -        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    -        writer.writeheader()
    -        for row in scraped_data:
    -            writer.writerow(row)

    +    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    +    args = argparser.parse_args()
    +    zipcode = args.zipcode
    +    sort = args.sort
    +    print ("Fetching data for %s" % (zipcode))
    +    scraped_data = parse(zipcode, sort)
    +    if scraped_data:
    +        print ("Writing data to output file")
    +        write_data_to_csv(scraped_data)
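    The main addition in this rewrite is handling Zillow's A/B-tested "type 2" page, where listings arrive as a JSON blob embedded in a script tag instead of HTML cards. A stand-alone sketch of that extraction, assuming the mobileSearchPageStore payload is still wrapped in HTML comment markers as the diff shows:

        import json
        from lxml import html

        def extract_listings(page_text):
            parser = html.fromstring(page_text)
            raw = parser.xpath(
                '//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
            if not raw:
                return []  # type 1 page: fall back to the HTML card parser
            # The payload is wrapped in <!-- ... --> markers, which json.loads rejects.
            cleaned = ''.join(raw).replace('<!--', '').replace('-->', '')
            data = json.loads(cleaned)
            return data.get('searchResults', {}).get('listResults', [])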
  3. @scrapehero revised this gist Feb 9, 2018. 1 changed file with 54 additions and 55 deletions.
    109 changes: 54 additions & 55 deletions zillow.py
    @@ -1,7 +1,6 @@
     from lxml import html
     import requests
     import unicodecsv as csv
    -from exceptions import ValueError
     import argparse

     def parse(zipcode,filter=None):
    @@ -14,57 +13,58 @@ def parse(zipcode,filter=None):
         url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)

         for i in range(5):
    -        try:
    -            headers= {
    -            'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    -            'accept-encoding':'gzip, deflate, sdch, br',
    -            'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    -            'cache-control':'max-age=0',
    -            'upgrade-insecure-requests':'1',
    -            'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    -            }
    -            response = requests.get(url, headers=headers, verify=False)
    -            parser = html.fromstring(response.text)
    -            search_results = parser.xpath("//div[@id='search-results']//article")
    -            properties_list = []
    +        # try:
    +        headers= {
    +        'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    +        'accept-encoding':'gzip, deflate, sdch, br',
    +        'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    +        'cache-control':'max-age=0',
    +        'upgrade-insecure-requests':'1',
    +        'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    +        }
    +        response = requests.get(url,headers=headers)
    +        print(response.status_code)
    +        parser = html.fromstring(response.text)
    +        search_results = parser.xpath("//div[@id='search-results']//article")
    +        properties_list = []

    -            for properties in search_results:
    -                raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
    -                raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    -                raw_state= properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
    -                raw_postal_code= properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
    -                raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
    -                raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
    -                raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
    -                url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
    -                raw_title = properties.xpath(".//h4//text()")
    +        for properties in search_results:
    +            raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
    +            raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    +            raw_state= properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
    +            raw_postal_code= properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
    +            raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
    +            raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
    +            raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
    +            url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
    +            raw_title = properties.xpath(".//h4//text()")

    -                address = ' '.join(' '.join(raw_address).split()) if raw_address else None
    -                city = ''.join(raw_city).strip() if raw_city else None
    -                state = ''.join(raw_state).strip() if raw_state else None
    -                postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
    -                price = ''.join(raw_price).strip() if raw_price else None
    -                info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',')
    -                broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
    -                title = ''.join(raw_title) if raw_title else None
    -                property_url = "https://www.zillow.com"+url[0] if url else None
    -                is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
    -                properties = {
    -                    'address':address,
    -                    'city':city,
    -                    'state':state,
    -                    'postal_code':postal_code,
    -                    'price':price,
    -                    'facts and features':info,
    -                    'real estate provider':broker,
    -                    'url':property_url,
    -                    'title':title
    -                }
    -                if is_forsale:
    -                    properties_list.append(properties)
    -            return properties_list
    -        except:
    -            print "Failed to process the page",url
    +            address = ' '.join(' '.join(raw_address).split()) if raw_address else None
    +            city = ''.join(raw_city).strip() if raw_city else None
    +            state = ''.join(raw_state).strip() if raw_state else None
    +            postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
    +            price = ''.join(raw_price).strip() if raw_price else None
    +            info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',')
    +            broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
    +            title = ''.join(raw_title) if raw_title else None
    +            property_url = "https://www.zillow.com"+url[0] if url else None
    +            is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
    +            properties = {
    +                'address':address,
    +                'city':city,
    +                'state':state,
    +                'postal_code':postal_code,
    +                'price':price,
    +                'facts and features':info,
    +                'real estate provider':broker,
    +                'url':property_url,
    +                'title':title
    +            }
    +            if is_forsale:
    +                properties_list.append(properties)
    +        return properties_list
    +        # except:
    +        #     print ("Failed to process the page",url)

     if __name__=="__main__":
         argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    @@ -78,11 +78,10 @@ def parse(zipcode,filter=None):
         args = argparser.parse_args()
         zipcode = args.zipcode
         sort = args.sort
    -    print "Fetching data for %s"%(zipcode)
    +    print ("Fetching data for %s"%(zipcode))
         scraped_data = parse(zipcode,sort)
    -    print "Writing data to output file"
    -
    -    with open("properties-%s.csv"%(zipcode),'w')as csvfile:
    +    print ("Writing data to output file")
    +    with open("properties-%s.csv"%(zipcode),'wb')as csvfile:
             fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url']
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
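    This revision ports the script from Python 2 to Python 3: the Python 2-only `from exceptions import ValueError` import goes away, print statements become function calls, and the CSV file mode changes from 'w' to 'wb'. The mode change matters because unicodecsv encodes rows to bytes itself, so under Python 3 the output file must be binary. A minimal sketch (the zipcode and row values here are made up for illustration):

        import unicodecsv as csv

        # unicodecsv writes encoded bytes, unlike the stdlib csv module,
        # so the file object must be opened binary ('wb'), not text ('w').
        with open("properties-10001.csv", 'wb') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['title', 'price'])
            writer.writeheader()
            writer.writerow({'title': 'House For Sale', 'price': '$500,000'})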
  4. @scrapehero revised this gist Jan 12, 2018. No changes.
  5. @scrapehero revised this gist Jan 12, 2018. 1 changed file with 5 additions and 2 deletions.
    7 changes: 5 additions & 2 deletions zillow.py
    @@ -23,10 +23,11 @@ def parse(zipcode,filter=None):
                 'upgrade-insecure-requests':'1',
                 'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
                 }
    -            response = requests.get(url,headers=headers)
    +            response = requests.get(url, headers=headers, verify=False)
                 parser = html.fromstring(response.text)
                 search_results = parser.xpath("//div[@id='search-results']//article")
                 properties_list = []
    +
                 for properties in search_results:
                     raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
                     raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    @@ -80,9 +81,11 @@ def parse(zipcode,filter=None):
         print "Fetching data for %s"%(zipcode)
         scraped_data = parse(zipcode,sort)
         print "Writing data to output file"
    +
         with open("properties-%s.csv"%(zipcode),'w')as csvfile:
             fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url']
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
             for row in scraped_data:
    -            writer.writerow(row)
    +            writer.writerow(row)

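    The notable change here is passing verify=False, which turns off TLS certificate verification in requests. That silences certificate errors (for example behind an intercepting proxy) but removes protection against man-in-the-middle attacks, and requests emits an InsecureRequestWarning on every such call. A sketch of what the flag does, for illustration only:

        import requests
        import urllib3

        # Suppress the InsecureRequestWarning that verify=False triggers;
        # a safer fix is pointing `verify` at the proxy's CA bundle instead.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        response = requests.get("https://www.zillow.com", verify=False, timeout=10)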
  6. @scrapehero revised this gist Sep 28, 2017. 1 changed file with 10 additions and 4 deletions.
    14 changes: 10 additions & 4 deletions zillow.py
    @@ -15,11 +15,18 @@ def parse(zipcode,filter=None):

         for i in range(5):
             try:
    -            response = requests.get(url)
    +            headers= {
    +            'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    +            'accept-encoding':'gzip, deflate, sdch, br',
    +            'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    +            'cache-control':'max-age=0',
    +            'upgrade-insecure-requests':'1',
    +            'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    +            }
    +            response = requests.get(url,headers=headers)
                 parser = html.fromstring(response.text)
                 search_results = parser.xpath("//div[@id='search-results']//article")
                 properties_list = []

                 for properties in search_results:
                     raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
                     raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
    @@ -78,5 +85,4 @@ def parse(zipcode,filter=None):
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
             for row in scraped_data:
    -            writer.writerow(row)
    -
    +            writer.writerow(row)
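    This is the first anti-blocking change in the gist's history: a plain requests.get(url) had started failing, so the revision attaches a full set of browser-like headers inside the existing retry loop. A minimal sketch of that retry-with-headers pattern (header values copied from the diff above, trimmed for brevity):

        import requests

        HEADERS = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
            'user-agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'),
        }

        def get_with_retries(url, attempts=5):
            # Retry a handful of times; the site intermittently returns non-200s.
            for _ in range(attempts):
                response = requests.get(url, headers=HEADERS)
                if response.status_code == 200:
                    return response
            return None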
  7. @scrapehero revised this gist Aug 18, 2017. No changes.
  8. @scrapehero revised this gist Aug 17, 2017. No changes.
  9. @scrapehero revised this gist Aug 16, 2017. No changes.
  10. @scrapehero revised this gist Aug 15, 2017. No changes.
  11. @scrapehero revised this gist Aug 14, 2017. No changes.
  12. @scrapehero revised this gist Aug 13, 2017. No changes.
  13. @scrapehero revised this gist Aug 12, 2017. No changes.
  14. @scrapehero revised this gist Aug 11, 2017. No changes.
  15. @scrapehero revised this gist Aug 10, 2017. No changes.
  16. @scrapehero revised this gist Aug 9, 2017. No changes.
  17. @scrapehero revised this gist Aug 8, 2017. No changes.
  18. @scrapehero revised this gist Aug 7, 2017. No changes.
  19. @scrapehero revised this gist Aug 5, 2017. No changes.
  20. @scrapehero revised this gist Aug 3, 2017. No changes.
  21. @scrapehero revised this gist Aug 2, 2017. No changes.
  22. @scrapehero revised this gist Aug 1, 2017. No changes.
  23. @scrapehero revised this gist Jul 31, 2017. No changes.
  24. @scrapehero revised this gist Jul 30, 2017. No changes.
  25. @scrapehero revised this gist Jul 29, 2017. No changes.
  26. @scrapehero revised this gist Jul 27, 2017. No changes.
  27. @scrapehero revised this gist Jul 26, 2017. No changes.
  28. @scrapehero revised this gist Jul 25, 2017. No changes.
  29. @scrapehero revised this gist Jul 22, 2017. No changes.
  30. @scrapehero revised this gist Jul 21, 2017. No changes.