import requests
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
import re
import time
import psutil

import numpy as np
from PIL import Image
import os
from os import path
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import plotly.express as px

import matplotlib as plot
from matplotlib.pyplot import figure
import seaborn as sns
sns.set(style="whitegrid")


dataset = pd.DataFrame()
URL = 'https://www.worldometers.info/coronavirus/'  # page the statistics table is scraped from
# Bound the wait and fail on an HTTP error status instead of parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')
# Element ids are unique within a document, so the table is looked up directly
# by its own id instead of walking down through its enclosing containers.
table = soup.find(id='main_table_countries_today')
if table is None:
    # Give a clear error rather than an AttributeError on None further down.
    raise RuntimeError("Table 'main_table_countries_today' was not found at " + URL)
table_rows = table.find_all('tr')  # every <tr> element inside the table
l = []
for tr in table_rows:
    cells = tr.find_all('td')  # the data cells of this row
    if not cells:
        continue  # rows without any <td> (presumably header rows) carry no data
    # Only the first 8 cells are kept, matching the 8 column names below.
    l.append([cell.text for cell in cells[:8]])
# Build the DataFrame from the list of row lists.
dataset = pd.DataFrame(l, columns=["Country","Total Cases","New Cases","Total Deaths","New Deaths","Total Recoverd","Active Cases","Serious Cases"])
def dataframeCleaner(dataset):
    """Return a cleaned copy of the scraped table.

    Every cell is converted to ``str``, stripped of ``,`` and ``+``
    characters, and turned into an ``int`` when the remaining text parses as
    one; otherwise the stripped text is kept. The last row is then dropped
    and cells that are empty or whitespace-only strings are replaced by 0.

    Args:
        dataset: DataFrame of raw cell values.

    Returns:
        A new DataFrame; the DataFrame passed in is left unmodified.
    """
    def _to_int_if_possible(value):
        # Thousands separators and the leading '+' stop int() from parsing.
        text = str(value).replace(',', '').replace('+', '')
        try:
            return int(text)
        except ValueError:
            # Not a whole number (e.g. a country name or an empty cell).
            return text

    # Work on a copy so the caller's DataFrame is not half-cleaned in place.
    cleaned = dataset.copy()
    for columnname in cleaned:
        cleaned[columnname] = [_to_int_if_possible(value) for value in cleaned[columnname]]

    cleaned = cleaned.drop(cleaned.tail(1).index)  # delete the last row
    cleaned = cleaned.replace(r'^\s*$', 0, regex=True)  # empty strings become 0
    return cleaned
# Notebook cell: display the scraped table as it currently stands.
dataset
Country Total Cases New Cases Total Deaths New Deaths Total Recoverd Active Cases Serious Cases
0 \nNorth America\n 524,685 +24,465 19,297 +1,631 33,735 471,653 11,849
1 \nEurope\n 816,794 +37,942 69,904 +4,086 196,573 550,317 30,232
2 \nAsia\n 274,251 +10,354 10,221 +325 133,838 130,192 6,615
3 \nSouth America\n 44,490 +4,885 1,786 +171 4,596 38,108 876
4 \nOceania\n 7,573 +96 55 +3 3,515 4,003 79
... ... ... ... ... ... ... ... ...
223 Total: 44,490 +4,885 1,786 +171 4,596 38,108 876
224 Total: 7,573 +96 55 +3 3,515 4,003 79
225 Total: 13,418 +538 695 +63 2,217 10,506 153
226 Total: 721 13 619 89 10
227 Total: 1,681,932 +78,280 101,971 +6,279 375,093 1,204,868 49,814

228 rows × 8 columns

# Replace the table with the result of dataframeCleaner, then display it.
dataset = dataframeCleaner(dataset)
dataset
Country Total Cases New Cases Total Deaths New Deaths Total Recoverd Active Cases Serious Cases
0 \nNorth America\n 524685 24465 19297 1631 33735 471653 11849
1 \nEurope\n 816794 37942 69904 4086 196573 550317 30232
2 \nAsia\n 274251 10354 10221 325 133838 130192 6615
3 \nSouth America\n 44490 4885 1786 171 4596 38108 876
4 \nOceania\n 7573 96 55 3 3515 4003 79
... ... ... ... ... ... ... ... ...
222 Total: 274251 10354 10221 325 133838 130192 6615
223 Total: 44490 4885 1786 171 4596 38108 876
224 Total: 7573 96 55 3 3515 4003 79
225 Total: 13418 538 695 63 2217 10506 153
226 Total: 721 0 13 0 619 89 10

227 rows × 8 columns

# Date stamp (YYYYMMDD, local time) used as the snapshot's file name.
filename = time.strftime("%Y%m%d")
# to_csv has to be called to write anything; the attribute reference alone
# saves nothing. NOTE(review): the '.csv' suffix is an assumption - confirm.
dataset.to_csv(filename + ".csv")
<bound method NDFrame.to_csv of                Country  Total Cases  New Cases  Total Deaths  New Deaths  \
0    \nNorth America\n       524685      24465         19297        1631   
1           \nEurope\n       816794      37942         69904        4086   
2             \nAsia\n       274251      10354         10221         325   
3    \nSouth America\n        44490       4885          1786         171   
4          \nOceania\n         7573         96            55           3   
..                 ...          ...        ...           ...         ...   
222             Total:       274251      10354         10221         325   
223             Total:        44490       4885          1786         171   
224             Total:         7573         96            55           3   
225             Total:        13418        538           695          63   
226             Total:          721          0            13           0   

     Total Recoverd  Active Cases  Serious Cases  
0             33735        471653          11849  
1            196573        550317          30232  
2            133838        130192           6615  
3              4596         38108            876  
4              3515          4003             79  
..              ...           ...            ...  
222          133838        130192           6615  
223            4596         38108            876  
224            3515          4003             79  
225            2217         10506            153  
226             619            89             10  

[227 rows x 8 columns]>
# Order the rows by total case count, largest first.
dataset = dataset.sort_values(by='Total Cases', ascending=False)
num_countries = 20
# head() keeps the first num_countries rows and still returns every row when
# the table is shorter than that (a negative tail() count selects other rows).
# NOTE(review): aggregate rows (continents, 'World', 'Total:') are ranked
# together with individual countries - confirm that this is intended.
plotdata = dataset.head(num_countries)[["Country", "Total Cases"]]
plotdata
Country Total Cases
7 World 1681932
221 Total: 816794
1 \nEurope\n 816794
0 \nNorth America\n 524685
220 Total: 524685
8 USA 491051
2 \nAsia\n 274251
222 Total: 274251
9 Spain 157053
10 Italy 147577
11 France 124869
12 Germany 120157
219 China 81907
13 UK 73758
14 Iran 68192
15 Turkey 47029
223 Total: 44490
3 \nSouth America\n 44490
16 Belgium 26667
17 Switzerland 24551
# Wide landscape figure size for the bar chart.
sns.set(rc={"figure.figsize": (18.7, 8.27)})

# Bar chart of "Total Cases" against "Country" for the rows in plotdata.
cvstc = sns.barplot(data=plotdata, x="Country", y="Total Cases")
# Order the rows by total death count, largest first.
dataset = dataset.sort_values(by='Total Deaths', ascending=False)
num_countries = 20
# head() keeps the first num_countries rows and still returns every row when
# the table is shorter than that (a negative tail() count selects other rows).
# NOTE(review): aggregate rows (continents, 'World', 'Total:') are ranked
# together with individual countries - confirm that this is intended.
plotdata = dataset.head(num_countries)[["Country", "Total Deaths"]]
plotdata
Country Total Deaths
7 World 101971
1 \nEurope\n 69904
221 Total: 69904
0 \nNorth America\n 19297
220 Total: 19297
10 Italy 18849
8 USA 18243
9 Spain 15970
11 France 13197
222 Total: 10221
2 \nAsia\n 10221
13 UK 8958
14 Iran 4232
219 China 3336
16 Belgium 3019
12 Germany 2688
18 Netherlands 2511
223 Total: 1786
3 \nSouth America\n 1786
20 Brazil 1057
# Wide landscape figure size for the bar chart.
sns.set(rc={"figure.figsize": (18.7, 8.27)})

# Bar chart of "Total Deaths" against "Country" for the rows in plotdata.
cvstc = sns.barplot(data=plotdata, x="Country", y="Total Deaths")
# Order the rows by total case count, largest first.
dataset = dataset.sort_values(by='Total Cases', ascending=False)
num_countries = 20
# head() keeps the first num_countries rows and still returns every row when
# the table is shorter than that (a negative tail() count selects other rows).
# Both measures are kept so they can be drawn on one combined chart.
plotdata = dataset.head(num_countries)[['Country', 'Total Deaths', 'Total Cases']]
plotdata
Country Total Deaths Total Cases
7 World 101971 1681932
1 \nEurope\n 69904 816794
221 Total: 69904 816794
0 \nNorth America\n 19297 524685
220 Total: 19297 524685
8 USA 18243 491051
222 Total: 10221 274251
2 \nAsia\n 10221 274251
9 Spain 15970 157053
10 Italy 18849 147577
11 France 13197 124869
12 Germany 2688 120157
219 China 3336 81907
13 UK 8958 73758
14 Iran 4232 68192
15 Turkey 1006 47029
223 Total: 1786 44490
3 \nSouth America\n 1786 44490
16 Belgium 3019 26667
17 Switzerland 1002 24551
# Combined chart: total cases as bars on the left y-axis, total deaths as a
# line on a second y-axis that shares the same x-axis.
fig, ax1 = plt.subplots(figsize=(18.7, 8.27))
color = 'tab:red'

# Target ax1 explicitly instead of relying on the current-axes state.
ax1 = sns.barplot(x='Country', y='Total Cases', data=plotdata, palette='summer', ax=ax1)
ax1.tick_params(axis='y')

# sns.set only updates the plotting parameters and returns None, so its result
# is not stored.
sns.set(style="ticks", rc={"lines.linewidth": 3.7})
ax2 = ax1.twinx()
# Label the right-hand axis with the quantity that is actually drawn on it.
ax2.set_ylabel('Total Deaths', fontsize=16)
ax2 = sns.lineplot(x='Country', y='Total Deaths', data=plotdata, sort=False, color=color, ax=ax2)
ax2.tick_params(axis='y', color=color)
# show plot
plt.show()
# Keep all rows, restricted to the country name and the three totals.
plotdata = dataset.loc[:, ['Country', 'Total Cases', 'Total Deaths', 'Total Recoverd']]
plotdata
Country Total Cases Total Deaths Total Recoverd
7 World 1681932 101971 375093
1 \nEurope\n 816794 69904 196573
221 Total: 816794 69904 196573
0 \nNorth America\n 524685 19297 33735
220 Total: 524685 19297 33735
... ... ... ... ...
215 Papua New Guinea 2 0 0
214 Caribbean Netherlands 2 0 0
216 Timor-Leste 2 0 1
217 Saint Pierre Miquelon 1 0 0
218 Yemen 1 0 0

227 rows × 4 columns

# Two regression plots against total cases on one square figure: deaths on the
# left y-axis and recoveries (in red) on a second y-axis.
fig, ax = plt.subplots(figsize=(8.27, 8.27))
# x and y are passed by keyword; seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x / y.
sns.regplot(x='Total Cases', y='Total Deaths', data=plotdata, ax=ax)
ax2 = ax.twinx()
sns.regplot(x='Total Cases', y='Total Recoverd', data=plotdata, ax=ax2, color='r')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1a45c0710>
Download notebook

(2 downloads)

Post categories:

python