Created
August 6, 2024 10:57
-
-
Save AnastasiyaByelyakova/6d9d3ee5ea206b7e4abb36a9c282a5db to your computer and use it in GitHub Desktop.
#selenium #firefox #openai
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # from webdriver_manager.firefox import GeckoDriverManager | |
| # from selenium.webdriver.chrome.service import Service | |
| # def main(data): | |
| # service = Service(executable_path=GeckoDriverManager().install()) | |
| # # Set path to the Firefox binary (NOTE: `opt` and `service` are never passed to the driver created below) | |
| # opt = webdriver.FirefoxOptions() | |
| # opt.binary_location = "/usr/bin/firefox" | |
| # driver = webdriver.Chrome() | |
| # # Load previously saved results so already-processed URLs can be skipped | |
| # with open('acceptance_rates_ai.json', 'r') as jh: | |
| # results = json.load(jh) | |
| # processed_urls = [i['url'] for i in results ] | |
| # for item in tqdm(data): | |
| # try: | |
| # url = item["Source (Manually checked) "].split(", ")[0].strip() | |
| # except: | |
| # continue | |
| # if not url or 'niche' in url or 'usnews' in url: | |
| # continue | |
| # if url in processed_urls: | |
| # continue | |
| # print([url,item["Source (Manually checked) "] ]) | |
| # driver.get(url) | |
| # html = driver.page_source | |
| # text = clean_html(html) | |
| # if text: | |
| # res = get_from_openai(text, prompt.format(item["College Names "]), schema) | |
| # results.append({ | |
| # "url": url, | |
| # "college": item["College Names "], | |
| # "id": item["ID"], | |
| # "html": html, | |
| # "text": text, | |
| # "data": res | |
| # }) | |
| # with open("acceptance_rates_ai.json", "w") as jh: | |
| # json.dump(results, jh, indent=2) # Pretty-printed JSON | |
| # return results | |
| # main(data.to_dict(orient='records')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment