Scrape Facebook page for posts
from os import getenv
import time

from selenium.webdriver import Firefox


def refresh_feed_elements(driver: Firefox):
    # Re-locate the main feed and its post containers after the DOM changes.
    main_feed = driver.find_elements_by_xpath(".//div[@role='feed']")[1]
    posts = main_feed.find_elements_by_xpath("./div")
    return main_feed, posts


def login(driver: Firefox, email: str, password: str):
    # Log in to Facebook with the given credentials.
    driver.get("https://www.facebook.com/")
    driver.find_element_by_id("email").send_keys(email)
    driver.find_element_by_id("pass").send_keys(password)
    driver.find_element_by_name("login").click()


def get_page(driver: Firefox, page: str):
    # Open the target page and give it a moment to load.
    driver.get(page)
    time.sleep(3)


def scrape_page(page: str):
    # Credentials are read from environment variables.
    email = getenv("FB_EMAIL")
    password = getenv("FB_PASSWORD")

    driver = Firefox()
    login(driver, email, password)
    get_page(driver, page)

    main_feed, posts = refresh_feed_elements(driver)
    # Class name shared by the post containers (not used further).
    post_class_name = posts[1].get_attribute("class")

    num_processed_posts = 0
    # Keep processing and scrolling until no new posts are loaded.
    while len(posts) != num_processed_posts:
        for i in range(num_processed_posts, len(posts)):
            current_post = posts[i]
            try:
                # Expand truncated posts before reading their text.
                current_post.find_element_by_xpath(
                    ".//div[text()[contains(., 'See More')]]"
                ).click()
            except Exception:
                print("Exception! Post number:", i)
            process_post(current_post.text)
            num_processed_posts += 1
            # Re-fetch the post elements so the next index is not stale.
            posts = main_feed.find_elements_by_xpath("./div")
        # Scroll to the bottom to trigger loading of more posts.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        main_feed, posts = refresh_feed_elements(driver)
    print("Posts processed:", num_processed_posts)


def process_post(post_text: str):
    # Write the post text to a file named after the fourth line of the post.
    split_text = post_text.split("\n")
    if len(split_text) > 3:
        file_path = f"confessions/{split_text[3]}.txt"
        with open(file_path, "w") as f:
            f.write(post_text)
        print(f"Written to file {file_path}")


if __name__ == "__main__":
    scrape_page("https://www.facebook.com/confesskgpee")
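A note on the Selenium API: the find_element_by_* and find_elements_by_* helpers used above come from Selenium 3 and were removed in Selenium 4, where lookups go through find_element/find_elements together with the By locator class. A minimal sketch of the two affected helpers under Selenium 4 (the _v4 names are only for illustration; the element IDs and XPath expressions are the same ones the script already uses):

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By

def login_v4(driver: Firefox, email: str, password: str):
    # Same login flow as above, written against the Selenium 4 locator API.
    driver.get("https://www.facebook.com/")
    driver.find_element(By.ID, "email").send_keys(email)
    driver.find_element(By.ID, "pass").send_keys(password)
    driver.find_element(By.NAME, "login").click()

def refresh_feed_elements_v4(driver: Firefox):
    # Same lookup as refresh_feed_elements above: the second element with
    # role='feed' is taken to be the main feed.
    main_feed = driver.find_elements(By.XPATH, ".//div[@role='feed']")[1]
    posts = main_feed.find_elements(By.XPATH, "./div")
    return main_feed, posts

The rest of the script is unchanged apart from swapping the locator calls. Either way, FB_EMAIL and FB_PASSWORD must be set in the environment, and a confessions/ directory must exist before running, since process_post writes into it.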
This was written to scrape the KGP Confessions page on Facebook.