aytvill · August 19, 2023 18:36 · Aug 29, 2019 · Aug 29, 2019 · Aug 29, 2019 · Aug 29, 2019
diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -19,7 +19,6 @@
 for i in range(0,len(p_blocks)):
 
   # 2.1 Loop through paragraph parents to extract its element name and id
-
   parents_list=[]
   for parent in p_blocks[i].parents:
 

diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -16,11 +16,10 @@
 # 2. for each paragraph, construct its parent elements hierarchy
 #Create a dataframe to collect p_blocks data
 p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count'])
-
-# 2.1 loop for each paragraph block
 for i in range(0,len(p_blocks)):
 
-  # Loop for each paragraph parent to extract its element name and id
+  # 2.1 Loop through paragraph parents to extract its element name and id
+
   parents_list=[]
   for parent in p_blocks[i].parents:
 
@@ -34,32 +33,27 @@
     # Append the parent name and id to the parents table
     parents_list.append(parent.name + 'id: ' + Parent_id)
 
-  # 2.2 Construct paragraph parent hierarchy
+  # 2.2 Construct parents hierarchy
   parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ]
   parent_element_list.reverse()
   parent_hierarchy = ' -> '.join(parent_element_list)
 
-  #Append p_blocks_df with the current paragraph data
+  #Append data table with the current paragraph data
   p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
                                   ,"parent_hierarchy":parent_hierarchy
                                   ,"element_text":p_blocks[i].text
                                   ,"element_text_Count":len(str(p_blocks[i].text))}
                                   ,ignore_index=True
                                   ,sort=False)
 
-# 3. concatenate paragraphs under the same parent hierarchy
+  # 3. concatenate paragraphs under the same parent hierarchy
 if len(p_blocks_df)>0:
-    #Group paragraphs by parent_hierarchy
     p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])
-
-    #Sum the paragraph length for each paragraph group
     p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()            
     p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)            
 
-# 4. select the longest paragraph as the main article
-max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
+# 4. count paragraphs length
+# 5. select the longest paragraph as the main article
+maxid=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
                                                      ,'parent_hierarchy']
-merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list())
-
-#Inspect the full Article text
-print(merge_text)
+merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==maxid,'element_text'].to_list())
diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -40,7 +40,7 @@
   parent_hierarchy = ' -> '.join(parent_element_list)
 
   #Append p_blocks_df with the current paragraph data
-  =p_blocks_df.append({"element_name":p_blocks[i].name
+  p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
                                   ,"parent_hierarchy":parent_hierarchy
                                   ,"element_text":p_blocks[i].text
                                   ,"element_text_Count":len(str(p_blocks[i].text))}

diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -49,7 +49,10 @@
 
 # 3. concatenate paragraphs under the same parent hierarchy
 if len(p_blocks_df)>0:
+    #Group paragraphs by parent_hierarchy
     p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])
+
+    #Sum the paragraph length for each paragraph group
     p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()            
     p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)            
 

diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -16,10 +16,11 @@
 # 2. for each paragraph, construct its parent elements hierarchy
 #Create a dataframe to collect p_blocks data
 p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count'])
+
+# 2.1 loop for each paragraph block
 for i in range(0,len(p_blocks)):
 
-  # 2.1 Loop through paragraph parents to extract its element name and id
-
+  # Loop for each paragraph parent to extract its element name and id
   parents_list=[]
   for parent in p_blocks[i].parents:
 
@@ -33,30 +34,29 @@
     # Append the parent name and id to the parents table
     parents_list.append(parent.name + 'id: ' + Parent_id)
 
-  # 2.2 Construct parents hierarchy
+  # 2.2 Construct paragraph parent hierarchy
   parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ]
   parent_element_list.reverse()
   parent_hierarchy = ' -> '.join(parent_element_list)
 
-  #Append data table with the current paragraph data
-  p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
+  #Append p_blocks_df with the current paragraph data
+  =p_blocks_df.append({"element_name":p_blocks[i].name
                                   ,"parent_hierarchy":parent_hierarchy
                                   ,"element_text":p_blocks[i].text
                                   ,"element_text_Count":len(str(p_blocks[i].text))}
                                   ,ignore_index=True
                                   ,sort=False)
 
-  # 3. concatenate paragraphs under the same parent hierarchy
+# 3. concatenate paragraphs under the same parent hierarchy
 if len(p_blocks_df)>0:
     p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])
     p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()            
     p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)            
 
-# 4. count paragraphs length
-# 5. select the longest paragraph as the main article
+# 4. select the longest paragraph as the main article
 max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
                                                      ,'parent_hierarchy']
 merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list())
 
-#Article text
+#Inspect the full Article text
 print(merge_text)
diff --git a/web_scraping_parse_articles.py b/web_scraping_parse_articles.py
@@ -0,0 +1,62 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+
+#Test url
+url = 'https://www.theguardian.com/technology/2019/aug/28/apple-ends-contracts-hundreds-workers-hired-to-listen-siri'
+
+#Request the article url to get the web page content.
+article = requests.get(url)
+
+# 1. extract all paragraph elements inside the page body
+articles = BeautifulSoup(article.content, 'html.parser')
+articles_body = articles.findAll('body')    
+p_blocks = articles_body[0].findAll('p')
+
+# 2. for each paragraph, construct its parent elements hierarchy
+#Create a dataframe to collect p_blocks data
+p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count'])
+for i in range(0,len(p_blocks)):
+
+  # 2.1 Loop through paragraph parents to extract its element name and id
+
+  parents_list=[]
+  for parent in p_blocks[i].parents:
+
+    #Extract the parent id attribute if it exists
+    Parent_id = ''
+    try:
+      Parent_id = parent['id']
+    except:
+      pass
+
+    # Append the parent name and id to the parents table
+    parents_list.append(parent.name + 'id: ' + Parent_id)
+
+  # 2.2 Construct parents hierarchy
+  parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ]
+  parent_element_list.reverse()
+  parent_hierarchy = ' -> '.join(parent_element_list)
+
+  #Append data table with the current paragraph data
+  p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
+                                  ,"parent_hierarchy":parent_hierarchy
+                                  ,"element_text":p_blocks[i].text
+                                  ,"element_text_Count":len(str(p_blocks[i].text))}
+                                  ,ignore_index=True
+                                  ,sort=False)
+
+  # 3. concatenate paragraphs under the same parent hierarchy
+if len(p_blocks_df)>0:
+    p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])
+    p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()            
+    p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)            
+
+# 4. count paragraphs length
+# 5. select the longest paragraph as the main article
+max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
+                                                     ,'parent_hierarchy']
+merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list())
+
+#Article text
+print(merge_text)
No results found