-
-
Save aytvill/0c53c9f7788f70453c51d9812b2ab615 to your computer and use it in GitHub Desktop.
Revisions
-
edemnati revised this gist
Aug 29, 2019 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -19,7 +19,6 @@ for i in range(0,len(p_blocks)): # 2.1 Loop trough paragraph parents to extract its element name and id parents_list=[] for parent in p_blocks[i].parents: -
edemnati revised this gist
Aug 29, 2019 . 1 changed file with 9 additions and 15 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,11 +16,10 @@ # 2. for each paragraph, construct its patents elements hierarchy #Create a dataframe to collect p_blocks data p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count']) for i in range(0,len(p_blocks)): # 2.1 Loop trough paragraph parents to extract its element name and id parents_list=[] for parent in p_blocks[i].parents: @@ -34,32 +33,27 @@ # Append the parent name and id to the parents table parents_list.append(parent.name + 'id: ' + Parent_id) # 2.2 Construct parents hierarchy parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ] parent_element_list.reverse() parent_hierarchy = ' -> '.join(parent_element_list) #Append data table with the current paragraph data p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name ,"parent_hierarchy":parent_hierarchy ,"element_text":p_blocks[i].text ,"element_text_Count":len(str(p_blocks[i].text))} ,ignore_index=True ,sort=False) # 3. concatenate paragraphs under the same parent hierarchy if len(p_blocks_df)>0: p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy']) p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum() p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True) # 4. count paragraphs length # 5. select the longest paragraph as the main article maxid=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax() ,'parent_hierarchy'] merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==maxid,'element_text'].to_list()) -
edemnati revised this gist
Aug 29, 2019 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -40,7 +40,7 @@ parent_hierarchy = ' -> '.join(parent_element_list) #Append p_blocks_df with the current paragraph data p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name ,"parent_hierarchy":parent_hierarchy ,"element_text":p_blocks[i].text ,"element_text_Count":len(str(p_blocks[i].text))} -
edemnati revised this gist
Aug 29, 2019 . 1 changed file with 3 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -49,7 +49,10 @@ # 3. concatenate paragraphs under the same parent hierarchy if len(p_blocks_df)>0: #Group paragraphs by parent_hierarchy p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy']) #Sum the paragraph lenght for each paragraph group p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum() p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True) -
edemnati revised this gist
Aug 28, 2019 . 1 changed file with 9 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,10 +16,11 @@ # 2. for each paragraph, construct its patents elements hierarchy #Create a dataframe to collect p_blocks data p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count']) # 2.1 loop for each paragraph block for i in range(0,len(p_blocks)): # Loop for each paragraph parent to extract its element name and id parents_list=[] for parent in p_blocks[i].parents: @@ -33,30 +34,29 @@ # Append the parent name and id to the parents table parents_list.append(parent.name + 'id: ' + Parent_id) # 2.2 Construct paragraph parent hierarchy parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ] parent_element_list.reverse() parent_hierarchy = ' -> '.join(parent_element_list) #Append p_blocks_df with the current paragraph data =p_blocks_df.append({"element_name":p_blocks[i].name ,"parent_hierarchy":parent_hierarchy ,"element_text":p_blocks[i].text ,"element_text_Count":len(str(p_blocks[i].text))} ,ignore_index=True ,sort=False) # 3. concatenate paragraphs under the same parent hierarchy if len(p_blocks_df)>0: p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy']) p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum() p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True) # 4. 
select the longest paragraph as the main article max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax() ,'parent_hierarchy'] merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list()) #Inspect the full Article text print(merge_text) -
edemnati created this gist
Aug 28, 2019 .There are no files selected for viewing
"""Extract the main article text from a web page.

Approach:
    1. Extract all paragraph (``<p>``) elements inside the page body.
    2. For each paragraph, construct its parent elements hierarchy.
    3. Group paragraphs that share the same parent hierarchy.
    4. Sum the paragraph text lengths of each group.
    5. Select the group with the most text as the main article.
"""
import pandas as pd

# Test url
url = 'https://www.theguardian.com/technology/2019/aug/28/apple-ends-contracts-hundreds-workers-hired-to-listen-siri'

# Columns of the table that collects the paragraph data.
P_BLOCKS_COLUMNS = ['element_name', 'parent_hierarchy', 'element_text', 'element_text_Count']


def fetch_html(page_url, timeout=10):
    """Request ``page_url`` and return the raw page content as bytes.

    Args:
        page_url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    # Imported here so the parsing/selection helpers below stay usable
    # without the network dependency being installed.
    import requests

    article = requests.get(page_url, timeout=timeout)
    article.raise_for_status()
    return article.content


def extract_paragraphs(html):
    """Return every ``<p>`` element found inside the page body.

    Falls back to searching the whole document when the page has no
    ``<body>`` element, instead of failing with an ``IndexError``.
    """
    from bs4 import BeautifulSoup

    articles = BeautifulSoup(html, 'html.parser')
    container = articles.find('body')
    if container is None:
        container = articles
    return container.find_all('p')


def build_parent_hierarchy(paragraph):
    """Describe the chain of parents of ``paragraph``, outermost first.

    Each parent is rendered as ``<name>id: <id>`` (the id part is empty
    when the element has no ``id`` attribute) and the parts are joined
    with ``' -> '``. The resulting string is only used as a grouping key.
    """
    # ``.parents`` walks from the nearest parent up to the document root.
    parents_list = [
        parent.name + 'id: ' + parent.get('id', '')
        for parent in paragraph.parents
    ]
    parents_list.reverse()
    return ' -> '.join(parents_list)


def build_paragraph_table(p_blocks):
    """Collect name, parent hierarchy, text and text length of each paragraph.

    Returns:
        A DataFrame with the columns listed in ``P_BLOCKS_COLUMNS``; it is
        empty (but still has those columns) when ``p_blocks`` is empty.
    """
    # Rows are gathered in a list and the DataFrame is built once:
    # ``DataFrame.append`` copied the frame on every call and no longer
    # exists in pandas 2.x.
    rows = [
        {
            'element_name': paragraph.name,
            'parent_hierarchy': build_parent_hierarchy(paragraph),
            'element_text': paragraph.text,
            'element_text_Count': len(paragraph.text),
        }
        for paragraph in p_blocks
    ]
    return pd.DataFrame(rows, columns=P_BLOCKS_COLUMNS)


def select_main_text(p_blocks_df):
    """Return the text of the paragraph group holding the most characters.

    Paragraphs are grouped by ``parent_hierarchy``; the texts of the group
    with the largest total ``element_text_Count`` are joined with newlines
    in their original order. Returns ``''`` when the table is empty.
    """
    if p_blocks_df.empty:
        return ''

    # 3. + 4. Sum the paragraph lengths of each parent hierarchy group.
    length_by_hierarchy = (
        p_blocks_df.groupby('parent_hierarchy')['element_text_Count'].sum()
    )

    # 5. The group with the largest total length is the main article.
    max_id = length_by_hierarchy.idxmax()
    is_main = p_blocks_df['parent_hierarchy'] == max_id
    return '\n'.join(p_blocks_df.loc[is_main, 'element_text'].tolist())


def main():
    """Download the test article and print its main text."""
    # 1. Extract all paragraph elements inside the page body.
    p_blocks = extract_paragraphs(fetch_html(url))

    # 2. For each paragraph, construct its parent elements hierarchy.
    p_blocks_df = build_paragraph_table(p_blocks)

    # 3. - 5. Keep the paragraphs of the largest group.
    merge_text = select_main_text(p_blocks_df)

    # Article text
    print(merge_text)


if __name__ == '__main__':
    main()