There are two main ways to extract data from a website:
- Use the website's API, if one exists. For example, Facebook offers the Facebook Graph API, which allows retrieval of data posted on Facebook.
- Access the HTML of the webpage and extract useful information/data from it. This technique is called web scraping, web harvesting, or web data extraction (a minimal sketch follows below).
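To make the second option concrete, here is a minimal scraping sketch using requests and BeautifulSoup. The URL and the h2 tag are placeholders; you would adapt both to the page you are actually scraping:

import requests
from bs4 import BeautifulSoup

# Fetch the page (placeholder URL)
html = requests.get('https://example.com').text

# Parse the HTML and extract text from a tag of interest
soup = BeautifulSoup(html, 'html.parser')
for heading in soup.find_all('h2'):   # 'h2' is just an example tag
    print(heading.get_text(strip=True))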
In this article, we will use the News API (newsapi.org). You can create your own API key by registering on the News API website.

Example: let's determine which topics newspapers associate with a public figure such as a head of state. We will take the case of Angela Merkel.
import pprint
import requests

secret = "Your API key"

# Define the endpoint (the News API 'everything' endpoint)
url = 'https://newsapi.org/v2/everything'

# Specify the query and the number of results to return
parameters = {
    'q': 'merkel',      # query phrase
    'pageSize': 100,    # maximum is 100
    'apiKey': secret    # your own API key
}

# Make the request
response = requests.get(url, params=parameters)

# Convert the response to JSON format and pretty print it
response_json = response.json()
pprint.pprint(response_json)
Output:
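The request returns a JSON object with a status flag, the total number of matching results, and the list of articles. A rough sketch of the shape to expect (values elided):

{'status': 'ok',
 'totalResults': ...,
 'articles': [{'source': {'id': ..., 'name': ...},
               'author': ...,
               'title': ...,
               'description': ...,
               'url': ...,
               'publishedAt': ...,
               'content': ...},
              ...]}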
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine the descriptions of all articles into one string
text_combined = ''
for i in response_json['articles']:
    if i['description'] is not None:
        text_combined += i['description'] + ' '

# Count the occurrences of each word
wordcount = {}
for word in text_combined.split():
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1

# Print the words sorted by frequency, most frequent first
for k, v in sorted(wordcount.items(),
                   key=lambda item: item[1],
                   reverse=True):
    print(k, v)
Output:
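As a side note, the manual counting loop above can be replaced by collections.Counter from the standard library, whose most_common() method also handles the sorting by frequency:

from collections import Counter

# Equivalent to the manual wordcount loop above
wordcount = Counter(text_combined.split())
for word, count in wordcount.most_common():
    print(word, count)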
The output contains many common stop words (in English, French, and German). We can clean the text by defining a list of these bad words and deleting them:
import re

# Initialize the list of stop words ("bad words") to remove
bad_words = ["a", "the", "of", "in", "to", "and", "on", "de", "with",
             "by", "at", "dans", "ont", "été", "les", "des", "au", "et",
             "après", "avec", "qui", "par", "leurs", "ils", "a", "pour",
             "les", "on", "as", "france", "eux", "où", "son", "le", "la",
             "en", "with", "is", "has", "for", "that", "an", "but", "be",
             "are", "du", "it", "à", "had", "ist", "Der", "um", "zu", "den",
             "der", "-", "und", "für", "Die", "von", "als",
             "sich", "nicht", "nach", "auch"]

# Collapse whitespace and strip punctuation
# (str.replace does not understand regex, so use re.sub for \s+)
r = re.sub(r'\s+', ' ', text_combined).replace(',', ' ').replace('.', ' ')

# Keep only words that are not stop words and are longer than 3 characters
words = r.split()
rst = [word for word in words if
       word.lower() not in bad_words
       and len(word) > 3]
rst = ' '.join(rst)

# Count word frequencies again on the cleaned text
wordcount = {}
for word in rst.split():
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1

for k, v in sorted(wordcount.items(),
                   key=lambda item: item[1],
                   reverse=True):
    print(k, v)
Output:
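Maintaining a multilingual stop-word list by hand is brittle. As an alternative sketch (assuming the nltk package is installed), NLTK ships curated stop-word lists for English, French, and German that could replace our manual list:

import nltk
from nltk.corpus import stopwords

# One-time download of the stop-word corpus
nltk.download('stopwords')

# Build a combined multilingual stop-word set
bad_words = set(stopwords.words('english')
                + stopwords.words('french')
                + stopwords.words('german'))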
# Generate and display a word cloud from the cleaned descriptions
word = WordCloud(max_font_size=40).generate(rst)
plt.figure()
plt.imshow(word, interpolation="bilinear")
plt.axis("off")
plt.show()
Output:
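If you want to keep the image rather than just display it, WordCloud can write it straight to a PNG (the filename here is just an example):

# Save the word cloud to disk (hypothetical filename)
word.to_file('merkel_descriptions.png')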
# Combine all article titles into one string
title_combined = ''
for i in response_json['articles']:
    title_combined += i['title'] + ' '

# Clean the titles: collapse whitespace and strip punctuation
titles = re.sub(r'\s+', ' ', title_combined).replace(',', ' ').replace('.', ' ')

# Filter out stop words and short words
words_t = titles.split()
result = [word for word in words_t if
          word.lower() not in bad_words and
          len(word) > 3]
result = ' '.join(result)

# Count word frequencies in the titles
wordcount = {}
for word in result.split():
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1

# Generate and display a word cloud of the titles
word = WordCloud(max_font_size=40).generate(result)
plt.figure()
plt.imshow(word, interpolation="bilinear")
plt.axis("off")
plt.show()
Output:
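As a closing note, WordCloud can also filter stop words itself through its stopwords parameter, so the manual filtering step could be folded into the cloud generation. A sketch, reusing our bad_words list:

# Let WordCloud drop the stop words itself (alternative to manual filtering)
word = WordCloud(max_font_size=40,
                 stopwords=set(bad_words)).generate(title_combined)
plt.figure()
plt.imshow(word, interpolation="bilinear")
plt.axis("off")
plt.show()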