Created
March 29, 2020 05:20
-
-
Save carlosmoran97/4ea8d030d5efa3c48adba2d6c4129a4b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import requests, lxml | |
| from bs4 import BeautifulSoup | |
| url = "https://www.tse.gob.sv/2018/escrutinio-final/asamblea/index.html" | |
| html = requests.get(url).text | |
| soup = BeautifulSoup(html, 'lxml') | |
| BASE_URL = "https://www.tse.gob.sv/2018/escrutinio-final/asamblea" | |
| panels = soup.findAll("div", {"class": "panel-purple"}) | |
| deptos_urls = [] | |
| def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None, | |
| truncate_sheet=False, | |
| **to_excel_kwargs): | |
| """ | |
| Append a DataFrame [df] to existing Excel file [filename] | |
| into [sheet_name] Sheet. | |
| If [filename] doesn't exist, then this function will create it. | |
| Parameters: | |
| filename : File path or existing ExcelWriter | |
| (Example: '/path/to/file.xlsx') | |
| df : dataframe to save to workbook | |
| sheet_name : Name of sheet which will contain DataFrame. | |
| (default: 'Sheet1') | |
| startrow : upper left cell row to dump data frame. | |
| Per default (startrow=None) calculate the last row | |
| in the existing DF and write to the next row... | |
| truncate_sheet : truncate (remove and recreate) [sheet_name] | |
| before writing DataFrame to Excel file | |
| to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()` | |
| [can be dictionary] | |
| Returns: None | |
| """ | |
| from openpyxl import load_workbook | |
| import pandas as pd | |
| # ignore [engine] parameter if it was passed | |
| if 'engine' in to_excel_kwargs: | |
| to_excel_kwargs.pop('engine') | |
| writer = pd.ExcelWriter(filename, engine='openpyxl') | |
| # Python 2.x: define [FileNotFoundError] exception if it doesn't exist | |
| try: | |
| FileNotFoundError | |
| except NameError: | |
| FileNotFoundError = IOError | |
| try: | |
| # try to open an existing workbook | |
| writer.book = load_workbook(filename) | |
| # get the last row in the existing Excel sheet | |
| # if it was not specified explicitly | |
| if startrow is None and sheet_name in writer.book.sheetnames: | |
| startrow = writer.book[sheet_name].max_row | |
| # truncate sheet | |
| if truncate_sheet and sheet_name in writer.book.sheetnames: | |
| # index of [sheet_name] sheet | |
| idx = writer.book.sheetnames.index(sheet_name) | |
| # remove [sheet_name] | |
| writer.book.remove(writer.book.worksheets[idx]) | |
| # create an empty sheet [sheet_name] using old index | |
| writer.book.create_sheet(sheet_name, idx) | |
| # copy existing sheets | |
| writer.sheets = {ws.title:ws for ws in writer.book.worksheets} | |
| except FileNotFoundError: | |
| # file does not exist yet, we will create it | |
| pass | |
| if startrow is None: | |
| startrow = 0 | |
| # write out the new sheet | |
| df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs) | |
| # save the workbook | |
| writer.save() | |
| for panel in panels: | |
| deptos_urls.append(str(panel.findAll('a')[1]['href'])) | |
| for panel in panels: | |
| df = pd.concat(pd.read_html(str(panel))) | |
| append_df_to_excel("data.xlsx", df, "DEP - " + str(panel.findAll('span')[1].text.split(" (")[0])) | |
| for url in deptos_urls: | |
| html = requests.get("{}/{}".format(BASE_URL, url)).text | |
| soup = BeautifulSoup(html, 'lxml') | |
| panels = soup.findAll("div", {"class": "panel-purple"}) | |
| for panel in panels: | |
| #print() | |
| df = pd.concat(pd.read_html(str(panel))) | |
| append_df_to_excel("data.xlsx", df, "MUN - " + str(panel.findAll('span')[1].text.split(" (")[0])) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment