"""Minimal breadth-first web crawler for nytimes.com article pages.

Starts from a seed article URL, downloads each page, extracts its anchor
links, resolves relative hrefs against the current page, and queues every
not-yet-visited nytimes.com link for crawling.
"""

from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Only links containing this prefix are followed (keeps the crawl on-site).
ALLOWED_PREFIX = "https://www.nytimes.com"


def get(url):
    """Download *url* and return its parsed HTML as a BeautifulSoup tree."""
    mycontent = requests.get(url)
    return BeautifulSoup(mycontent.text, "html.parser")


def extract_links(html, current_link):
    """Return the absolute URL of every ``<a href=...>`` found in *html*.

    Relative hrefs are resolved against *current_link*; anchors without an
    ``href`` attribute are skipped (``tag.get('href')`` returns None for
    them).
    """
    links = []
    for tag in html.find_all('a'):
        href = tag.get('href')
        # BUG FIX: the original regex-matched the href *before* checking it
        # for None, raising TypeError on any <a> tag without an href.
        if not href:
            continue
        # urljoin handles absolute links, root-relative ("/foo") and
        # path-relative ("foo") hrefs in one call, replacing the original
        # hand-rolled regex-based base_path/domain_name resolution.
        links.append(urljoin(current_link, href))
    return links


def crawl(seed_links, max_pages=None):
    """Breadth-first crawl starting from the URLs in *seed_links*.

    Visits each page at most once, follows only links that contain
    ALLOWED_PREFIX, and returns the set of visited URLs.  *max_pages*
    optionally caps the number of pages fetched; the default of None means
    unlimited, matching the original script's behavior.
    """
    # deque gives O(1) popleft; the original used list.pop(0), which is O(n).
    queue = deque(seed_links)
    # A set makes the "already visited?" membership test O(1) instead of the
    # original list's O(n) scan per link.
    already_visited = set()

    while queue and (max_pages is None or len(already_visited) < max_pages):
        current_link = queue.popleft()
        # The same URL may have been queued from several pages before its
        # first visit; skip it on re-encounter instead of fetching twice.
        if current_link in already_visited:
            continue
        already_visited.add(current_link)

        html = get(current_link)

        # TODO: extract title / content / author / timestamps / keywords
        # from the page and persist them (connect to the database once at
        # startup, insert per page, commit after each execute) — the
        # original left these as unfilled placeholder variables.

        for link in extract_links(html, current_link):
            print("link", link)
            # Stay on the target site and never re-queue a visited page.
            if ALLOWED_PREFIX in link and link not in already_visited:
                queue.append(link)

        print("number of visited links", len(already_visited))
        print("number of links", len(queue))

    return already_visited


if __name__ == "__main__":
    crawl(["https://www.nytimes.com/2019/07/02/dining/ice-cream-shops.html"])