"""Minimal breadth-first web crawler for nytimes.com article pages.

Starts from a seed article URL, downloads each page, extracts its anchor
links, resolves relative hrefs against the current page, and queues every
not-yet-visited nytimes.com link for crawling.
"""

from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Only links containing this prefix are followed (keeps the crawl on-site).
ALLOWED_PREFIX = "https://www.nytimes.com"


def get(url):
    """Download *url* and return its parsed HTML as a BeautifulSoup tree."""
    mycontent = requests.get(url)
    return BeautifulSoup(mycontent.text, "html.parser")


def extract_links(html, current_link):
    """Return the absolute URL of every ``<a href=...>`` found in *html*.

    Relative hrefs are resolved against *current_link*; anchors without an
    ``href`` attribute are skipped (``tag.get('href')`` returns None for
    them).
    """
    links = []
    for tag in html.find_all('a'):
        href = tag.get('href')
        # BUG FIX: the original regex-matched the href *before* checking it
        # for None, raising TypeError on any <a> tag without an href.
        if not href:
            continue
        # urljoin handles absolute links, root-relative ("/foo") and
        # path-relative ("foo") hrefs in one call, replacing the original
        # hand-rolled regex-based base_path/domain_name resolution.
        links.append(urljoin(current_link, href))
    return links


def crawl(seed_links, max_pages=None):
    """Breadth-first crawl starting from the URLs in *seed_links*.

    Visits each page at most once, follows only links that contain
    ALLOWED_PREFIX, and returns the set of visited URLs.  *max_pages*
    optionally caps the number of pages fetched; the default of None means
    unlimited, matching the original script's behavior.
    """
    # deque gives O(1) popleft; the original used list.pop(0), which is O(n).
    queue = deque(seed_links)
    # A set makes the "already visited?" membership test O(1) instead of the
    # original list's O(n) scan per link.
    already_visited = set()

    while queue and (max_pages is None or len(already_visited) < max_pages):
        current_link = queue.popleft()
        # The same URL may have been queued from several pages before its
        # first visit; skip it on re-encounter instead of fetching twice.
        if current_link in already_visited:
            continue
        already_visited.add(current_link)

        html = get(current_link)

        # TODO: extract title / content / author / timestamps / keywords
        # from the page and persist them (connect to the database once at
        # startup, insert per page, commit after each execute) — the
        # original left these as unfilled placeholder variables.

        for link in extract_links(html, current_link):
            print("link", link)
            # Stay on the target site and never re-queue a visited page.
            if ALLOWED_PREFIX in link and link not in already_visited:
                queue.append(link)

        print("number of visited links", len(already_visited))
        print("number of links", len(queue))

    return already_visited


if __name__ == "__main__":
    crawl(["https://www.nytimes.com/2019/07/02/dining/ice-cream-shops.html"])