AnastasiyaByelyakova · August 6, 2024 10:57
diff --git a/selenium_firefox.py b/selenium_firefox.py
 # from webdriver_manager.firefox import GeckoDriverManager
 # from selenium.webdriver.chrome.service import Service
 
 # def main(data):

        
 #     service = Service(executable_path=GeckoDriverManager().install())
 #     # Set path to firefox binary
 #     opt = webdriver.FirefoxOptions()
 #     opt.binary_location = "/usr/bin/firefox"
 #     driver = webdriver.Chrome()
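 #     # Set webdriver path -- NOTE(review): no path is actually set here; `service` and `opt` above are never passed to webdriver.Chrome()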

    
 #     with open('acceptance_rates_ai.json', 'r') as jh:
 #         results = json.load(jh)
 #         processed_urls = [i['url'] for i in results ]

 #     for item in tqdm(data):
 #         try:
 #             url = item["Source (Manually checked) "].split(", ")[0].strip()
 #         except:
 #             continue
 #         if not url or 'niche' in url or 'usnews' in url:
 #             continue
 #         if url in processed_urls:
 #             continue
 #         print([url,item["Source (Manually checked) "] ])
 #         driver.get(url)
 #         html = driver.page_source

 #         text = clean_html(html)

 #         if text:
 #             res = get_from_openai(text, prompt.format(item["College Names "]), schema)

 #             results.append({
 #                 "url": url,
 #                 "college": item["College Names "],
 #                 "id": item["ID"],
 #                 "html": html,
 #                 "text": text,
 #                 "data": res
 #             })

 #             with open("acceptance_rates_ai.json", "w") as jh:
 #                 json.dump(results, jh, indent=2)  # Pretty-printed JSON

 #     return results

 # main(data.to_dict(orient='records'))
	# from webdriver_manager.firefox import GeckoDriverManager
	# from selenium.webdriver.chrome.service import Service

	# def main(data):


	# service = Service(executable_path=GeckoDriverManager().install())
	# # Set path to firefox binary
	# opt = webdriver.FirefoxOptions()
	# opt.binary_location = "/usr/bin/firefox"
	# driver = webdriver.Chrome()
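	# # Set webdriver path -- NOTE(review): no path is actually set here; `service` and `opt` above are never passed to webdriver.Chrome()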


	# with open('acceptance_rates_ai.json', 'r') as jh:
	# results = json.load(jh)
	# processed_urls = [i['url'] for i in results ]

	# for item in tqdm(data):
	# try:
	# url = item["Source (Manually checked) "].split(", ")[0].strip()
	# except:
	# continue
	# if not url or 'niche' in url or 'usnews' in url:
	# continue
	# if url in processed_urls:
	# continue
	# print([url,item["Source (Manually checked) "] ])
	# driver.get(url)
	# html = driver.page_source

	# text = clean_html(html)

	# if text:
	# res = get_from_openai(text, prompt.format(item["College Names "]), schema)

	# results.append({
	# "url": url,
	# "college": item["College Names "],
	# "id": item["ID"],
	# "html": html,
	# "text": text,
	# "data": res
	# })

	# with open("acceptance_rates_ai.json", "w") as jh:
	# json.dump(results, jh, indent=2) # Pretty-printed JSON

	# return results

	# main(data.to_dict(orient='records'))