Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save aytvill/0c53c9f7788f70453c51d9812b2ab615 to your computer and use it in GitHub Desktop.

Select an option

Save aytvill/0c53c9f7788f70453c51d9812b2ab615 to your computer and use it in GitHub Desktop.

Revisions

  1. @edemnati edemnati revised this gist Aug 29, 2019. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -19,7 +19,6 @@
    for i in range(0,len(p_blocks)):

    # 2.1 Loop through paragraph parents to extract each element name and id

    parents_list=[]
    for parent in p_blocks[i].parents:

  2. @edemnati edemnati revised this gist Aug 29, 2019. 1 changed file with 9 additions and 15 deletions.
    24 changes: 9 additions & 15 deletions web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -16,11 +16,10 @@
    # 2. for each paragraph, construct its parent elements hierarchy
    #Create a dataframe to collect p_blocks data
    p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count'])

    # 2.1 loop for each paragraph block
    for i in range(0,len(p_blocks)):

    # Loop for each paragraph parent to extract its element name and id
    # 2.1 Loop through paragraph parents to extract each element name and id

    parents_list=[]
    for parent in p_blocks[i].parents:

    @@ -34,32 +33,27 @@
    # Append the parent name and id to the parents table
    parents_list.append(parent.name + 'id: ' + Parent_id)

    # 2.2 Construct paragraph parent hierarchy
    # 2.2 Construct parents hierarchy
    parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ]
    parent_element_list.reverse()
    parent_hierarchy = ' -> '.join(parent_element_list)

    #Append p_blocks_df with the current paragraph data
    #Append data table with the current paragraph data
    p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
    ,"parent_hierarchy":parent_hierarchy
    ,"element_text":p_blocks[i].text
    ,"element_text_Count":len(str(p_blocks[i].text))}
    ,ignore_index=True
    ,sort=False)

    # 3. concatenate paragraphs under the same parent hierarchy
    # 3. concatenate paragraphs under the same parent hierarchy
    if len(p_blocks_df)>0:
    #Group paragraphs by parent_hierarchy
    p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])

    #Sum the paragraph length for each paragraph group
    p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()
    p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)

    # 4. select the longest paragraph as the main article
    max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
    # 4. count paragraphs length
    # 5. select the longest paragraph as the main article
    maxid=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
    ,'parent_hierarchy']
    merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list())

    #Inspect the full Article text
    print(merge_text)
    merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==maxid,'element_text'].to_list())
  3. @edemnati edemnati revised this gist Aug 29, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -40,7 +40,7 @@
    parent_hierarchy = ' -> '.join(parent_element_list)

    #Append p_blocks_df with the current paragraph data
    =p_blocks_df.append({"element_name":p_blocks[i].name
    p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
    ,"parent_hierarchy":parent_hierarchy
    ,"element_text":p_blocks[i].text
    ,"element_text_Count":len(str(p_blocks[i].text))}
  4. @edemnati edemnati revised this gist Aug 29, 2019. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -49,7 +49,10 @@

    # 3. concatenate paragraphs under the same parent hierarchy
    if len(p_blocks_df)>0:
    #Group paragraphs by parent_hierarchy
    p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])

    #Sum the paragraph length for each paragraph group
    p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()
    p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)

  5. @edemnati edemnati revised this gist Aug 28, 2019. 1 changed file with 9 additions and 9 deletions.
    18 changes: 9 additions & 9 deletions web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -16,10 +16,11 @@
    # 2. for each paragraph, construct its parent elements hierarchy
    #Create a dataframe to collect p_blocks data
    p_blocks_df=pd.DataFrame(columns=['element_name','parent_hierarchy','element_text','element_text_Count'])

    # 2.1 loop for each paragraph block
    for i in range(0,len(p_blocks)):

    # 2.1 Loop through paragraph parents to extract each element name and id

    # Loop for each paragraph parent to extract its element name and id
    parents_list=[]
    for parent in p_blocks[i].parents:

    @@ -33,30 +34,29 @@
    # Append the parent name and id to the parents table
    parents_list.append(parent.name + 'id: ' + Parent_id)

    # 2.2 Construct parents hierarchy
    # 2.2 Construct paragraph parent hierarchy
    parent_element_list = ['' if (x == 'None' or x is None) else x for x in parents_list ]
    parent_element_list.reverse()
    parent_hierarchy = ' -> '.join(parent_element_list)

    #Append data table with the current paragraph data
    p_blocks_df=p_blocks_df.append({"element_name":p_blocks[i].name
    #Append p_blocks_df with the current paragraph data
    =p_blocks_df.append({"element_name":p_blocks[i].name
    ,"parent_hierarchy":parent_hierarchy
    ,"element_text":p_blocks[i].text
    ,"element_text_Count":len(str(p_blocks[i].text))}
    ,ignore_index=True
    ,sort=False)

    # 3. concatenate paragraphs under the same parent hierarchy
    # 3. concatenate paragraphs under the same parent hierarchy
    if len(p_blocks_df)>0:
    p_blocks_df_groupby_parent_hierarchy=p_blocks_df.groupby(by=['parent_hierarchy'])
    p_blocks_df_groupby_parent_hierarchy_sum=p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()
    p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)

    # 4. count paragraphs length
    # 5. select the longest paragraph as the main article
    # 4. select the longest paragraph as the main article
    max_id=p_blocks_df_groupby_parent_hierarchy_sum.loc[p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
    ,'parent_hierarchy']
    merge_text='\n'.join(p_blocks_df.loc[p_blocks_df['parent_hierarchy']==max_id,'element_text'].to_list())

    #Article text
    #Inspect the full Article text
    print(merge_text)
  6. @edemnati edemnati created this gist Aug 28, 2019.
    62 changes: 62 additions & 0 deletions web_scraping_parse_articles.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,62 @@
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd

    # Extract the main article text from a news page.
    #
    # Heuristic: every <p> element is tagged with the chain of its ancestor
    # elements (name + id). Paragraphs sharing the same ancestor chain are
    # grouped, and the group with the largest total text length is taken to be
    # the article body.

    #Test url
    url = 'https://www.theguardian.com/technology/2019/aug/28/apple-ends-contracts-hundreds-workers-hired-to-listen-siri'

    # Request the article url to get the web page content.
    # The timeout keeps the script from hanging forever on an unresponsive
    # server; raise_for_status() stops an HTTP error page from being parsed as
    # if it were the article.
    article = requests.get(url, timeout=30)
    article.raise_for_status()

    # 1. extract all paragraph elements inside the page body
    articles = BeautifulSoup(article.content, 'html.parser')
    articles_body = articles.find('body')
    # A document without a <body> simply yields no paragraphs instead of
    # raising IndexError.
    p_blocks = articles_body.find_all('p') if articles_body is not None else []

    # 2. for each paragraph, construct its parent elements hierarchy
    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    p_blocks_columns = ['element_name', 'parent_hierarchy', 'element_text', 'element_text_Count']
    p_blocks_rows = []
    for p_block in p_blocks:

        # 2.1 Loop through the paragraph's parents to extract each element
        # name and id (an empty string is used when the parent has no id).
        parents_list = [
            parent.name + 'id: ' + parent.get('id', '')
            for parent in p_block.parents
        ]

        # 2.2 Construct the parents hierarchy, outermost element first
        # (.parents yields the closest ancestor first, hence the reversal).
        parent_element_list = list(reversed(parents_list))
        parent_hierarchy = ' -> '.join(parent_element_list)

        # Record the current paragraph data
        p_blocks_rows.append({
            "element_name": p_block.name,
            "parent_hierarchy": parent_hierarchy,
            "element_text": p_block.text,
            "element_text_Count": len(str(p_block.text)),
        })

    # Data table with one row per paragraph
    p_blocks_df = pd.DataFrame(p_blocks_rows, columns=p_blocks_columns)

    # Default so that merge_text is always defined, even when the page
    # contains no paragraphs.
    merge_text = ''

    # 3. concatenate paragraphs under the same parent hierarchy
    if not p_blocks_df.empty:
        # Group paragraphs by parent_hierarchy
        p_blocks_df_groupby_parent_hierarchy = p_blocks_df.groupby(by=['parent_hierarchy'])

        # 4. count paragraphs length: total text length per paragraph group
        p_blocks_df_groupby_parent_hierarchy_sum = (
            p_blocks_df_groupby_parent_hierarchy[['element_text_Count']].sum()
        )
        p_blocks_df_groupby_parent_hierarchy_sum.reset_index(inplace=True)

        # 5. select the longest paragraph group as the main article
        longest_group_row = p_blocks_df_groupby_parent_hierarchy_sum['element_text_Count'].idxmax()
        max_id = p_blocks_df_groupby_parent_hierarchy_sum.loc[longest_group_row, 'parent_hierarchy']
        is_main_article = p_blocks_df['parent_hierarchy'] == max_id
        merge_text = '\n'.join(p_blocks_df.loc[is_main_article, 'element_text'].to_list())

        #Article text
        print(merge_text)