# Youtube video: https://youtu.be/dwCe9l_geK4
"""Scrape cabin descriptions from bigbearcoolcabins.com listing pages.

For every URL in ``urls`` the script downloads the page, collects the text of
the paragraphs and list items inside the property-description container,
folds that text down to plain ASCII, and writes it to ``<last-url-segment>.json``
in the current working directory as ``{"content": "<text>"}``.

Pages that cannot be fetched (network failure, timeout, HTTP error status) are
reported on stderr and skipped so the remaining URLs are still processed.
"""
import json
import os
import sys
import unicodedata

import requests
from bs4 import BeautifulSoup

# Seconds to wait on the server before giving up on a page. requests has no
# default timeout, so without this a stalled connection blocks forever.
REQUEST_TIMEOUT = 30

# CSS selector for the description text: paragraphs and bullet items inside
# the property-description container.
DESCRIPTION_SELECTOR = (
    '#node-vr-listing-full-group-vr-property-desc p, '
    '#node-vr-listing-full-group-vr-property-desc ul li'
)

urls = [
    'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/moonridge-cali-bear-cabin/',
    'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/switzerland',
    "https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/pines",
]

# Remove the trailing slash if present; otherwise os.path.basename() below
# would return an empty string and the output file would be named ".json".
urls = [url.rstrip('/') for url in urls]

for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx as failures instead of scraping the error page.
        response.raise_for_status()
    except requests.RequestException as err:
        print('Skipping {}: {}'.format(url, err), file=sys.stderr)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the text of every matching element, each followed by one space.
    paragraphs = soup.select(DESCRIPTION_SELECTOR)
    content = ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)

    # Remove special Unicode characters: NFKD splits accented/compatibility
    # characters into base character + marks, then everything that is still
    # non-ASCII is dropped.
    content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode()

    # Extract the file name from the URL (its last path segment).
    file_name = os.path.basename(url)

    # Create a dictionary with the data
    data = {'content': content}

    # Save the data to a .json file
    with open(file_name + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, separators=(',', ':'), indent=2)