Skip to content

Instantly share code, notes, and snippets.

@mdvsh
Created October 16, 2020 16:43
Show Gist options
  • Save mdvsh/11a4a5b14977d25f443d37ea005b6897 to your computer and use it in GitHub Desktop.
Save mdvsh/11a4a5b14977d25f443d37ea005b6897 to your computer and use it in GitHub Desktop.

Revisions

  1. mdvsh created this gist Oct 16, 2020.
    33 changes: 33 additions & 0 deletions mangla_learning.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,33 @@
    """Extract school names from the finalists table saved in ``bhai.html``.

    The script reads the first ``<table class="stripe">`` in the page, turns each
    body row into a dict keyed by the header cells, pulls a school name out of
    every ``Finalist Name(s)`` cell that mentions "School", prints a preview of
    the result and writes all names to ``mangla_learning.csv``.
    """
    import csv  # unused at present; kept so a csv.writer export needs no import change
    import re

    import pandas
    from bs4 import BeautifulSoup

    # BeautifulSoup consumes the file while constructing the tree, so the handle
    # can be closed as soon as parsing is done.
    with open('bhai.html', 'r') as src:
        soup = BeautifulSoup(src, 'lxml')

    table = soup.find("table", attrs={'class': 'stripe'})

    # Column names come from the first row of the table header.
    header_row = table.thead.find_all("tr")[0]
    heading = [th.text.replace('\n', '').strip() for th in header_row.find_all("th")]

    # One dict per body row. zip() stops at the shorter sequence, so a row with
    # fewer cells than headers simply lacks the trailing keys.
    table_data = [
        {
            th: td.text.replace('\n', '').strip()
            for td, th in zip(tr.find_all("td"), heading)
        }
        for tr in table.tbody.find_all("tr")
    ]

    # Compiled once instead of being looked up on every iteration.
    multi_space = re.compile(' +')

    schools = []
    for row in table_data:
        # .get() keeps short rows (no such column) from aborting the whole run.
        finalist = row.get('Finalist Name(s)', '')
        if 'School' not in finalist:
            continue
        s = multi_space.sub(' ', finalist)
        # Keep the text between the first "(" and the first ")", then drop its
        # first 8 characters.
        # NOTE(review): the 8-character offset presumably skips a fixed label in
        # front of the school name -- confirm against the actual page.
        schools.append(s[s.find("(") + 1:s.find(")")][8:])

    df = pandas.DataFrame(schools)
    # head must be *called*; printing the bare attribute shows the bound method.
    print(df.head())

    df.to_csv('mangla_learning.csv')