""" Original code is from pythoncircle.com https://www.pythoncircle.com/post/522/python-script-7-scraping-tweets-using-beautifulsoup/#new_comment_522 Modern python is good to go with Python 3.* Python 2.7 is nearing End of Life soon. Error Observed "bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? " Solution:- pip install lxml 1. Tested with Python 3.6.8 Along with packages of requests, beautifulsoup4, bs4 another package 'lxml' is required. 2. Overall 4 packages are required in 'pip install' beautifulsoup4==4.8.1 bs4==0.0.1 requests==2.22.0 lxml==4.4.1 """ # script to scrap tweets by a twitter user. # Author - ThePythonDjango.Com # dependencies - BeautifulSoup, requests from bs4 import BeautifulSoup import requests import sys import json def usage(): msg = """ Please use the below command to use the script. python script_name.py twitter_username """ print(msg) sys.exit(1) def get_tweet_text(tweet): tweet_text_box = tweet.find("p", {"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"}) images_in_tweet_tag = tweet_text_box.find_all("a", {"class": "twitter-timeline-link u-hidden"}) tweet_text = tweet_text_box.text for image_in_tweet_tag in images_in_tweet_tag: tweet_text = tweet_text.replace(image_in_tweet_tag.text, '') return tweet_text def get_this_page_tweets(soup): tweets_list = list() tweets = soup.find_all("li", {"data-item-type": "tweet"}) for tweet in tweets: tweet_data = None try: tweet_data = get_tweet_text(tweet) except Exception as e: continue #ignore if there is any loading or tweet error if tweet_data: tweets_list.append(tweet_data) print(".", end="") sys.stdout.flush() return tweets_list def get_tweets_data(username, soup): tweets_list = list() tweets_list.extend(get_this_page_tweets(soup)) next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"] while True: next_url = "https://twitter.com/i/profiles/show/" + username + \ "/timeline/tweets?include_available_features=1&" \ "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false" next_response = None try: next_response = requests.get(next_url) except Exception as e: # in case there is some issue with request. None encountered so far. 
def get_this_page_tweets(soup):
    tweets_list = list()
    tweets = soup.find_all("li", {"data-item-type": "tweet"})
    for tweet in tweets:
        tweet_data = None
        try:
            tweet_data = get_tweet_text(tweet)
        except Exception:
            continue  # ignore if there is any loading or tweet error
        if tweet_data:
            tweets_list.append(tweet_data)
            print(".", end="")
            sys.stdout.flush()
    return tweets_list


def get_tweets_data(username, soup):
    tweets_list = list()
    tweets_list.extend(get_this_page_tweets(soup))

    # pagination pointer for the next batch of tweets
    next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]

    while True:
        next_url = "https://twitter.com/i/profiles/show/" + username + \
                   "/timeline/tweets?include_available_features=1&" \
                   "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false"

        next_response = None
        try:
            next_response = requests.get(next_url)
        except Exception as e:
            # in case there is some issue with the request; none encountered so far
            print(e)
            return tweets_list

        tweets_data = next_response.text
        tweets_obj = json.loads(tweets_data)
        if not tweets_obj["has_more_items"] and not tweets_obj["min_position"]:
            # using two checks here because in one case has_more_items was false
            # but there were more items
            print("\nNo more tweets returned")
            break
        next_pointer = tweets_obj["min_position"]
        html = tweets_obj["items_html"]
        soup = BeautifulSoup(html, 'lxml')
        tweets_list.extend(get_this_page_tweets(soup))

    return tweets_list


# dump final result in a json file
def dump_data(username, tweets):
    filename = username + "_twitter.json"
    print("\nDumping data in file " + filename)
    data = dict()
    data["tweets"] = tweets
    with open(filename, 'w') as fh:
        fh.write(json.dumps(data))
    return filename


def get_username():
    # if username is not passed on the command line
    if len(sys.argv) < 2:
        usage()
    username = sys.argv[1].strip().lower()
    if not username:
        usage()
    return username


def start(username=None):
    # fall back to the command-line argument if no username was passed in
    if username is None:
        username = get_username()

    url = "https://www.twitter.com/" + username
    print("\n\nDownloading tweets for " + username)

    response = None
    try:
        response = requests.get(url)
    except Exception as e:
        print(repr(e))
        sys.exit(1)

    if response.status_code != 200:
        print("Non success status code returned " + str(response.status_code))
        sys.exit(1)

    soup = BeautifulSoup(response.text, 'lxml')

    if soup.find("div", {"class": "errorpage-topbar"}):
        print("\n\n Error: Invalid username.")
        sys.exit(1)

    tweets = get_tweets_data(username, soup)

    # dump data in a json file
    dump_data(username, tweets)

    print(str(len(tweets)) + " tweets dumped.")


if __name__ == "__main__":
    start()
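# Example invocation (hypothetical username, for illustration only):
#
#     python script_name.py some_user
#
# This writes some_user_twitter.json containing a single JSON object of the
# form {"tweets": ["first tweet text", "second tweet text", ...]}.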