Skip to content

Instantly share code, notes, and snippets.

@dheepakg
Created January 23, 2018 06:40
Show Gist options
  • Select an option

  • Save dheepakg/5d25cceb34c62344b985a154903d606e to your computer and use it in GitHub Desktop.

Select an option

Save dheepakg/5d25cceb34c62344b985a154903d606e to your computer and use it in GitHub Desktop.
wikiScrap
from bs4 import BeautifulSoup
import requests
# Download a Wikipedia page, save a prettified copy to AR_Rahman.html, then
# walk every <td style="vertical-align:top"> cell and report the cells whose
# first child is a text node that starts with a double quote.
# NOTE(review): presumably those cells are the quoted song titles of the
# track listing -- confirm against the live page markup.
#
# Other pages tried while experimenting:
#url = 'https://en.wikipedia.org/wiki/A._R._Rahman'
#url = 'https://assets.digitalocean.com/articles/eng_python/beautiful-soup/mockturtle.html'
url = 'https://en.wikipedia.org/wiki/Bombay_(soundtrack)'

# Without a timeout requests waits forever on an unresponsive server.
page = requests.get(url, timeout=30)
print(page.status_code)
soup = BeautifulSoup(page.text,"html.parser")

# The context manager guarantees the file is flushed and closed; the explicit
# encoding keeps non-ASCII page text from failing on non-UTF-8 locales.
with open('AR_Rahman.html','w', encoding='utf-8') as html_file:
    html_file.write(soup.prettify())

# Not referenced again below; kept so the name stays available for inspection.
table_list = soup.find_all('td')
# find_all is the current spelling of the deprecated findAll alias.
song_list = soup.find_all('td', style="vertical-align:top")

for row_num, element in enumerate(song_list):
    print("row_num",row_num,"first if",element)
    # Row 0 is skipped, exactly as the original truthiness test did.
    if not row_num:
        continue
    print("row_num",row_num,"second if", element)
    element_list = list(element)
    # The text form of the child list begins with "[<a href" when the first
    # child is a link whose first attribute is href. Slicing (rather than
    # indexing four single characters) cannot raise on short or empty cells.
    if str(element_list)[4:8] == 'href':
        continue
    print("row_num",row_num,"third if", element)
    first_child = element_list[0] if element_list else None
    # Only text nodes (NavigableString is a str subclass) can start with a
    # quote; indexing a Tag with [0] would be an attribute lookup and fail.
    if isinstance(first_child, str) and first_child.startswith('"'):
        print("row_num",row_num,"4th if", element)
        print("row_num",row_num,"Song with quotes", first_child)
from bs4 import BeautifulSoup
import requests
# Minimal BeautifulSoup example: download one page, print the HTTP status
# code, then print the list of all <p> elements found in it.
#
# Other pages tried while experimenting:
#url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
#url = 'https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3'
url = 'https://assets.digitalocean.com/articles/eng_python/beautiful-soup/mockturtle.html'

# Without a timeout requests waits forever on an unresponsive server.
page = requests.get(url, timeout=30)
print(page.status_code)
soup = BeautifulSoup(page.text,"html.parser")
print(soup.find_all('p'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment