-
-
Save mcadhoc/efb31d0d41676bb0feff114ba67ba000 to your computer and use it in GitHub Desktop.
Revisions
-
ikicker revised this gist
May 27, 2020. 1 changed file with 6 additions and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,6 +3,7 @@ import unicodecsv as csv import argparse import json from urllib.request import Request, urlopen def clean(text): @@ -114,12 +115,15 @@ def get_data_from_json(raw_json_data): def parse(zipcode, filter=None): url = create_url(zipcode, filter) response = get_response(url) if not response: print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.") return None req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) webpage = urlopen(req).read() parser = html.fromstring(webpage) search_results = parser.xpath("//div[@id='search-results']//article") if not search_results: -
scrapehero revised this gist
May 16, 2019. 1 changed file with 178 additions and 80 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,89 +2,187 @@ import requests import unicodecsv as csv import argparse import json def clean(text): if text: return ' '.join(' '.join(text).split()) return None def get_headers(): # Creating headers. headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, sdch, br', 'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4', 'cache-control': 'max-age=0', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'} return headers def create_url(zipcode, filter): # Creating Zillow URL based on the filter. if filter == "newest": url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode) elif filter == "cheapest": url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode) else: url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode) print(url) return url def save_to_file(response): # saving response to `response.html` with open("response.html", 'w') as fp: fp.write(response.text) def write_data_to_csv(data): # saving scraped data to csv. with open("properties-%s.csv" % (zipcode), 'wb') as csvfile: fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in data: writer.writerow(row) def get_response(url): # Getting response from zillow.com. 
for i in range(5): response = requests.get(url, headers=get_headers()) print("status code received:", response.status_code) if response.status_code != 200: # saving response to file for debugging purpose. save_to_file(response) continue else: save_to_file(response) return response return None def get_data_from_json(raw_json_data): # getting data from json (type 2 of their A/B testing page) cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "") properties_list = [] try: json_data = json.loads(cleaned_data) search_results = json_data.get('searchResults').get('listResults', []) for properties in search_results: address = properties.get('addressWithZip') property_info = properties.get('hdpData', {}).get('homeInfo') city = property_info.get('city') state = property_info.get('state') postal_code = property_info.get('zipcode') price = properties.get('price') bedrooms = properties.get('beds') bathrooms = properties.get('baths') area = properties.get('area') info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft' broker = properties.get('brokerName') property_url = properties.get('detailUrl') title = properties.get('statusText') data = {'address': address, 'city': city, 'state': state, 'postal_code': postal_code, 'price': price, 'facts and features': info, 'real estate provider': broker, 'url': property_url, 'title': title} properties_list.append(data) return properties_list except ValueError: print("Invalid json") return None def parse(zipcode, filter=None): url = create_url(zipcode, filter) response = get_response(url) if not response: print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.") return None parser = html.fromstring(response.text) search_results = parser.xpath("//div[@id='search-results']//article") if not search_results: print("parsing from json data") # identified as type 2 page raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()') return 
get_data_from_json(raw_json_data) print("parsing from html page") properties_list = [] for properties in search_results: raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()") raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()") raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()") raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()") raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()") raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()") raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()") url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href") raw_title = properties.xpath(".//h4//text()") address = clean(raw_address) city = clean(raw_city) state = clean(raw_state) postal_code = clean(raw_postal_code) price = clean(raw_price) info = clean(raw_info).replace(u"\xb7", ',') broker = clean(raw_broker_name) title = clean(raw_title) property_url = "https://www.zillow.com" + url[0] if url else None is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]') properties = {'address': address, 'city': city, 'state': state, 'postal_code': postal_code, 'price': price, 'facts and features': info, 'real estate provider': broker, 'url': property_url, 'title': title} if is_forsale: properties_list.append(properties) return properties_list if __name__ == "__main__": # Reading arguments argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) argparser.add_argument('zipcode', help='') sortorder_help = """ available sort orders are : newest : Latest property details, cheapest : Properties with cheapest price """ argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You') args = argparser.parse_args() zipcode = args.zipcode sort 
= args.sort print ("Fetching data for %s" % (zipcode)) scraped_data = parse(zipcode, sort) if scraped_data: print ("Writing data to output file") write_data_to_csv(scraped_data) -
scrapehero revised this gist
Feb 9, 2018. 1 changed file with 54 additions and 55 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,6 @@ from lxml import html import requests import unicodecsv as csv import argparse def parse(zipcode,filter=None): @@ -14,57 +13,58 @@ def parse(zipcode,filter=None): url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode) for i in range(5): # try: headers= { 'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'accept-encoding':'gzip, deflate, sdch, br', 'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4', 'cache-control':'max-age=0', 'upgrade-insecure-requests':'1', 'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' } response = requests.get(url,headers=headers) print(response.status_code) parser = html.fromstring(response.text) search_results = parser.xpath("//div[@id='search-results']//article") properties_list = [] for properties in search_results: raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()") raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()") raw_state= properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()") raw_postal_code= properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()") raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()") raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()") raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()") url = 
properties.xpath(".//a[contains(@class,'overlay-link')]/@href") raw_title = properties.xpath(".//h4//text()") address = ' '.join(' '.join(raw_address).split()) if raw_address else None city = ''.join(raw_city).strip() if raw_city else None state = ''.join(raw_state).strip() if raw_state else None postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None price = ''.join(raw_price).strip() if raw_price else None info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',') broker = ''.join(raw_broker_name).strip() if raw_broker_name else None title = ''.join(raw_title) if raw_title else None property_url = "https://www.zillow.com"+url[0] if url else None is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]') properties = { 'address':address, 'city':city, 'state':state, 'postal_code':postal_code, 'price':price, 'facts and features':info, 'real estate provider':broker, 'url':property_url, 'title':title } if is_forsale: properties_list.append(properties) return properties_list # except: # print ("Failed to process the page",url) if __name__=="__main__": argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) @@ -78,11 +78,10 @@ def parse(zipcode,filter=None): args = argparser.parse_args() zipcode = args.zipcode sort = args.sort print ("Fetching data for %s"%(zipcode)) scraped_data = parse(zipcode,sort) print ("Writing data to output file") with open("properties-%s.csv"%(zipcode),'wb')as csvfile: fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() -
scrapehero revised this gist
Jan 12, 2018 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jan 12, 2018. 1 changed file with 5 additions and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -23,10 +23,11 @@ def parse(zipcode,filter=None): 'upgrade-insecure-requests':'1', 'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' } response = requests.get(url, headers=headers, verify=False) parser = html.fromstring(response.text) search_results = parser.xpath("//div[@id='search-results']//article") properties_list = [] for properties in search_results: raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()") raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()") @@ -80,9 +81,11 @@ def parse(zipcode,filter=None): print "Fetching data for %s"%(zipcode) scraped_data = parse(zipcode,sort) print "Writing data to output file" with open("properties-%s.csv"%(zipcode),'w')as csvfile: fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in scraped_data: writer.writerow(row) -
scrapehero revised this gist
Sep 28, 2017. 1 changed file with 10 additions and 4 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -15,11 +15,18 @@ def parse(zipcode,filter=None): for i in range(5): try: headers= { 'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'accept-encoding':'gzip, deflate, sdch, br', 'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4', 'cache-control':'max-age=0', 'upgrade-insecure-requests':'1', 'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' } response = requests.get(url,headers=headers) parser = html.fromstring(response.text) search_results = parser.xpath("//div[@id='search-results']//article") properties_list = [] for properties in search_results: raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()") raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()") @@ -78,5 +85,4 @@ def parse(zipcode,filter=None): writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in scraped_data: writer.writerow(row) -
scrapehero revised this gist
Aug 18, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 17, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 16, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 15, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 14, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 13, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 12, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 11, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 10, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 9, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 8, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 7, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 5, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 3, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 2, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Aug 1, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 31, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 30, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 29, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 27, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 26, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 25, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 22, 2017 . No changes.There are no files selected for viewing
-
scrapehero revised this gist
Jul 21, 2017 . No changes.There are no files selected for viewing
Newer · Older