
@neshpat
Forked from thatguynef/web_scraping.py
Created March 7, 2023 11:56
Revisions

  1. @thatguynef revised this gist Mar 7, 2023. 1 changed file with 1 addition and 0 deletions.

     1 change: 1 addition & 0 deletions in web_scraping.py
     @@ -1,3 +1,4 @@
     + # Youtube video: https://youtu.be/dwCe9l_geK4
       import requests
       import json
       import os
  2. @thatguynef created this gist Mar 6, 2023.

     34 changes: 34 additions & 0 deletions in web_scraping.py
     @@ -0,0 +1,34 @@
     import requests
     import json
     import os
     import unicodedata
     from bs4 import BeautifulSoup

     urls = ['https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/moonridge-cali-bear-cabin/',
             'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/switzerland',
             'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/pines']

     # Remove the trailing slash if present, so os.path.basename returns the last path segment
     urls = [url.rstrip('/') for url in urls]

     for url in urls:
         response = requests.get(url)
         soup = BeautifulSoup(response.text, 'html.parser')

         # Extract the text from the paragraph and list-item tags inside the description block
         paragraphs = soup.select('#node-vr-listing-full-group-vr-property-desc p, #node-vr-listing-full-group-vr-property-desc ul li')
         content = ''.join([paragraph.get_text() + ' ' for paragraph in paragraphs])

         # Remove special Unicode characters (decompose, then drop anything non-ASCII)
         content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode()

         # Extract the file name from the URL
         file_name = os.path.basename(url)

         # Create a dictionary with the data
         data = {'content': content}

         # Save the data to a .json file named after the URL's last path segment
         with open(file_name + '.json', 'w') as outfile:
             json.dump(data, outfile, separators=(',', ':'), indent=2)
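The Unicode-cleanup step in the script can be exercised on its own, without any network access. A minimal sketch (the sample string below is made up for illustration):

```python
import unicodedata

def clean_text(content: str) -> str:
    # NFKD decomposes accented characters and maps compatibility
    # characters (e.g. non-breaking space) to plain equivalents;
    # encoding to ASCII with errors='ignore' then drops the rest.
    return unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode()

# Accents are stripped to base letters, curly quotes are dropped,
# and the non-breaking space becomes a regular space.
print(clean_text('Caf\u00e9 \u201ccozy\u201d\u00a0cabin'))  # → Cafe cozy cabin
```

Note this is lossy by design: characters with no ASCII decomposition (like the curly quotes) vanish entirely rather than being replaced.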