Skip to content

Instantly share code, notes, and snippets.

@AnastasiyaByelyakova
Created August 6, 2024 10:57
Show Gist options
  • Select an option

  • Save AnastasiyaByelyakova/6d9d3ee5ea206b7e4abb36a9c282a5db to your computer and use it in GitHub Desktop.

Select an option

Save AnastasiyaByelyakova/6d9d3ee5ea206b7e4abb36a9c282a5db to your computer and use it in GitHub Desktop.

Revisions

  1. AnastasiyaByelyakova created this gist Aug 6, 2024.
    51 changes: 51 additions & 0 deletions selenium_firefox.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,51 @@
    # from webdriver_manager.firefox import GeckoDriverManager
    # from selenium.webdriver.chrome.service import Service

    # def main(data):


    #     service = Service(executable_path=GeckoDriverManager().install())
    #     # Set path to firefox binary
    #     opt = webdriver.FirefoxOptions()
    #     opt.binary_location = "/usr/bin/firefox"
    #     driver = webdriver.Chrome()
    #     # Set webdriver path


    #     with open('acceptance_rates_ai.json', 'r') as jh:
    #         results = json.load(jh)
    #     processed_urls = [i['url'] for i in results ]

    #     for item in tqdm(data):
    #         try:
    #             url = item["Source (Manually checked) "].split(", ")[0].strip()
    #         except:
    #             continue
    #         if not url or 'niche' in url or 'usnews' in url:
    #             continue
    #         if url in processed_urls:
    #             continue
    #         print([url,item["Source (Manually checked) "] ])
    #         driver.get(url)
    #         html = driver.page_source

    #         text = clean_html(html)

    #         if text:
    #             res = get_from_openai(text, prompt.format(item["College Names "]), schema)

    #             results.append({
    #                 "url": url,
    #                 "college": item["College Names "],
    #                 "id": item["ID"],
    #                 "html": html,
    #                 "text": text,
    #                 "data": res
    #             })

    #         with open("acceptance_rates_ai.json", "w") as jh:
    #             json.dump(results, jh, indent=2) # Pretty-printed JSON

    #     return results

    # main(data.to_dict(orient='records'))