Created
August 6, 2024 10:57
-
-
Save AnastasiyaByelyakova/6d9d3ee5ea206b7e4abb36a9c282a5db to your computer and use it in GitHub Desktop.
Revisions
-
AnastasiyaByelyakova created this gist
Aug 6, 2024. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.chrome.service import Service
# def main(data):
# service = Service(executable_path=GeckoDriverManager().install())
# # Set path to firefox binary
# opt = webdriver.FirefoxOptions()
# opt.binary_location = "/usr/bin/firefox"
# driver = webdriver.Chrome()
# # Set webdriver path
# with open('acceptance_rates_ai.json', 'r') as jh:
# results = json.load(jh)
# processed_urls = [i['url'] for i in results ]
# for item in tqdm(data):
# try:
# url = item["Source (Manually checked) "].split(", ")[0].strip()
# except:
# continue
# if not url or 'niche' in url or 'usnews' in url:
# continue
# if url in processed_urls:
# continue
# print([url,item["Source (Manually checked) "] ])
# driver.get(url)
# html = driver.page_source
# text = clean_html(html)
# if text:
# res = get_from_openai(text, prompt.format(item["College Names "]), schema)
# results.append({
# "url": url,
# "college": item["College Names "],
# "id": item["ID"],
# "html": html,
# "text": text,
# "data": res
# })
# with open("acceptance_rates_ai.json", "w") as jh:
# json.dump(results, jh, indent=2) # Pretty-printed JSON
# return results
# main(data.to_dict(orient='records'))