-
-
Save neshpat/4720ebbd24871be5bbb4ae6d6ea67430 to your computer and use it in GitHub Desktop.
Revisions
-
thatguynef revised this gist
Mar 7, 2023 — 1 changed file with 1 addition and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,4 @@ # Youtube video: https://youtu.be/dwCe9l_geK4 import requests import json import os -
thatguynef created this gist
Mar 6, 2023. There are no files selected for viewing.
# Youtube video: https://youtu.be/dwCe9l_geK4
"""Scrape property descriptions from Big Bear Cool Cabins listing pages.

For each listing URL: fetch the page, pull the description paragraphs and
bullet items out of the HTML, strip non-ASCII characters, and write the
text to a <listing-slug>.json file in the current directory.
"""
import requests
import json
import os
import unicodedata
from bs4 import BeautifulSoup

# CSS selector for the listing description block: its paragraphs plus any
# bullet-list items.
_DESC_SELECTOR = (
    '#node-vr-listing-full-group-vr-property-desc p, '
    '#node-vr-listing-full-group-vr-property-desc ul li'
)

urls = ['https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/moonridge-cali-bear-cabin/',
        'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/switzerland',
        "https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/pines"
        ]

# Remove the trailing slash if present, so os.path.basename() yields the
# listing slug instead of an empty string.
urls = [url.rstrip('/') for url in urls]


def clean_text(text):
    """Return *text* with special Unicode characters reduced to plain ASCII."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode()


def scrape_listing(url):
    """Fetch one listing page and return its description as a single string.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if the server does not answer in time.
    """
    # Fix: the original call had no timeout (a hung server would stall the
    # whole run forever) and no status check (an error page would be
    # silently parsed as if it were a listing).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the content within the HTML paragraph tags.
    paragraphs = soup.select(_DESC_SELECTOR)
    return ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)


def main():
    """Scrape every URL in *urls* and write one JSON file per listing."""
    for url in urls:
        content = clean_text(scrape_listing(url))
        # Extract the file name (listing slug) from the URL.
        file_name = os.path.basename(url)
        data = {'content': content}
        # Fix: explicit encoding — the original relied on the platform
        # default, which is not UTF-8 everywhere.
        with open(file_name + '.json', 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile, separators=(',', ':'), indent=2)


if __name__ == '__main__':
    main()