import pandas as pd 
import numpy as np
import scipy.stats as stats

base = pd.read_csv('data/cardio_base.csv')
base

alco = pd.read_csv('data/cardio_alco.csv', sep=';')
alco

covid = pd.read_csv('data/covid_data.csv')
covid

base['age'] = base['age']//365
base['age'] = base['age'].astype(int)
base

q1 = base[['age', 'weight']].groupby('age').mean()
q1 = q1.sort_values('weight', ascending=False)
q1 = q1.reset_index()
q1

max_weight = q1['weight'].max()
max_weight

75.87372076023392

min_weight = q1['weight'].min()
min_weight

59.0

diff = max_weight - min_weight
diff

16.873720760233923

diff_percent = (diff / min_weight) * 100
diff_percent

28.599526712260886

q2 = base[['age', 'cholesterol']].groupby('age').mean()
q2.reset_index(inplace=True)
q2['50+'] = q2['age'] > 50
q2

q2 = q2[['50+', 'cholesterol']].groupby('50+').mean()
q2.reset_index(inplace=True)
q2

diff = q2.iloc[1,1] / q2.iloc[0,1]
diff

1.1951082506505808

diff_percent = (diff - 1) * 100
diff_percent

19.51082506505808

q3 = base[['gender', 'smoke']].groupby('gender').sum()
q3.reset_index(inplace=True)
q3

diff = q3.iloc[1,1] / q3.iloc[0,1]
diff

6.587945879458794

round(diff)

7

tallest_1_percent = np.percentile(base['height'], 99)
tallest_1_percent

184.0

spearman_correlation = base.corr(method='spearman')
spearman_correlation

hi_spearman_correlation = spearman_correlation.unstack()
hi_spearman_correlation = hi_spearman_correlation.sort_values(ascending=False)
hi_spearman_correlation = hi_spearman_correlation[hi_spearman_correlation != 1.0]
hi_spearman_correlation = hi_spearman_correlation.reset_index()
hi_spearman_correlation = hi_spearman_correlation.iloc[0]
hi_spearman_correlation

level_0       ap_lo
level_1       ap_hi
0          0.735436
Name: 0, dtype: object

average_height = base['height'].mean()
average_height

164.35922857142856

std_dev_height = base['height'].std()
std_dev_height

8.210126364538038

q6 = base.copy()
q6['is_far'] = q6['height'].apply(lambda x: abs(x - average_height) > (2 * std_dev_height))
q6

q6 = q6[['is_far', 'id']].groupby('is_far').count()
q6.reset_index(inplace=True)
q6

total_people = q6['id'].sum()
total_people

70000

percent_far = (q6.iloc[1,1] / total_people) * 100
round(percent_far)

3

q7 = pd.merge(base, alco, on='id', how='left')
q7 = q7[~q7['alco'].isna()]
q7['alco'] = q7['alco'].astype(int)
q7

q7['50+'] = q7['age'] > 50
q7

q7 = q7[q7['50+']]
q7.reset_index(inplace=True, drop=True)
q7

q7 = q7[['id', 'alco']].groupby('alco').count()
q7.reset_index(inplace=True)
q7

total_people_50p = q7['id'].sum()
total_people_50p

36874

percent_alco = (q7.iloc[1,1] / total_people_50p) * 100
round(percent_alco)

5

# Get blood pressure data
men_bp = base[base['gender'] == 2][['ap_hi', 'ap_lo']]
women_bp = base[base['gender'] == 1][['ap_hi', 'ap_lo']]

# Calculate means
men_mean = men_bp.mean()
women_mean = women_bp.mean()

# Calculate standard errors
men_se = stats.sem(men_bp)
women_se = stats.sem(women_bp)

# Degrees of freedom
df_men = len(men_bp) - 1
df_women = len(women_bp) - 1

# Confidence intervals
confidence = 0.95
men_ci = stats.t.interval(confidence, df_men, men_mean, men_se)
women_ci = stats.t.interval(confidence, df_women, women_mean, women_se)

print(f"Men's blood pressure 95% confidence interval: {men_ci}")
print(f"Women's blood pressure 95% confidence interval: {women_ci}")

Men's blood pressure 95% confidence interval: (array([128.24715319,  97.90115989]), array([131.91059098, 103.20280415]))
Women's blood pressure 95% confidence interval: (array([126.68741225,  92.91733378]), array([129.59108544,  96.1282186 ]))

# Compare confidence intervals
if men_ci[0][0] > women_ci[1][0] and men_ci[0][1] > women_ci[1][1]:
    print("Men have higher blood pressure than women with 95% confidence.")
else:
    print("We cannot say with 95% confidence that men have higher blood pressure than women.")

We cannot say with 95% confidence that men have higher blood pressure than women.

# Get cholesterol data
smokers = base[base['smoke'] == 1][['cholesterol']]
non_smokers = base[base['smoke'] == 0][['cholesterol']]

# Calculate means
smokers_mean = smokers.mean()
non_smokers_mean = non_smokers.mean()

# Calculate standard errors
smokers_se = stats.sem(smokers)
non_smokers_se = stats.sem(non_smokers)

# Degrees of freedom
df_smokers = len(smokers) - 1
df_non_smokers = len(non_smokers) - 1

# Confidence intervals
confidence = 0.95
smokers_ci = stats.t.interval(confidence, df_smokers, smokers_mean, smokers_se)
non_smokers_ci = stats.t.interval(confidence, df_non_smokers, non_smokers_mean, non_smokers_se)

print(f"Smokers' cholesterol level 95% confidence interval: {smokers_ci}")
print(f"Non smokers' cholesterol level 95% confidence interval: {non_smokers_ci}")

Smokers' cholesterol level 95% confidence interval: (array([1.37252981]), array([1.40652676]))
Non smokers' cholesterol level 95% confidence interval: (array([1.35940534]), array([1.36995813]))

# Compare confidence intervals
if smokers_ci[0] > non_smokers_ci[1]:
    print("Smokers have higher cholesterol level than non smokers with 95% confidence.")
else:
    print("We cannot say with 95% confidence that smokers have higher cholesterol level than non smokers.")

Smokers have higher cholesterol level than non smokers with 95% confidence.

# Get blood pressure data
smokers = base[base['smoke'] == 1][['ap_hi', 'ap_lo']]
non_smokers = base[base['smoke'] == 0][['ap_hi', 'ap_lo']]

# Calculate means
smokers_mean = smokers.mean()
non_smokers_mean = non_smokers.mean()

# Calculate standard errors
smokers_se = stats.sem(smokers)
non_smokers_se = stats.sem(non_smokers)

# Degrees of freedom
df_smokers = len(smokers) - 1
df_non_smokers = len(non_smokers) - 1

# Confidence intervals
confidence = 0.95
smokers_ci = stats.t.interval(confidence, df_smokers, smokers_mean, smokers_se)
non_smokers_ci = stats.t.interval(confidence, df_non_smokers, non_smokers_mean, non_smokers_se)

print(f"Smokers' blood pressure 95% confidence interval: {smokers_ci}")
print(f"Non smokers' blood pressure 95% confidence interval: {non_smokers_ci}")

Smokers' blood pressure 95% confidence interval: (array([127.89991216,  95.55652567]), array([128.82111232, 103.99250983]))
Non smokers' blood pressure 95% confidence interval: (array([127.61102269,  94.85064074]), array([130.11183924,  97.80245885]))

# Compare confidence intervals
if smokers_ci[0][0] > non_smokers_ci[1][0] and smokers_ci[0][1] > non_smokers_ci[1][1]:
    print("Smokers have higher blood pressure than non-smokers with 95% confidence.")
else:
    print("We cannot say with 95% confidence that smokers have higher blood pressure than non-smokers.")

We cannot say with 95% confidence that smokers have higher blood pressure than non-smokers.

# Get weight data
smokers = base[base['smoke'] == 1][['weight']]
non_smokers = base[base['smoke'] == 0][['weight']]

# Calculate means
smokers_mean = smokers.mean()
non_smokers_mean = non_smokers.mean()

# Calculate standard errors
smokers_se = stats.sem(smokers)
non_smokers_se = stats.sem(non_smokers)

# Degrees of freedom
df_smokers = len(smokers) - 1
df_non_smokers = len(non_smokers) - 1

# Confidence intervals
confidence = 0.95
smokers_ci = stats.t.interval(confidence, df_smokers, smokers_mean, smokers_se)
non_smokers_ci = stats.t.interval(confidence, df_non_smokers, non_smokers_mean, non_smokers_se)

print(f"Smokers' weight 95% confidence interval: {smokers_ci}")
print(f"Non smokers' weight 95% confidence interval: {non_smokers_ci}")

Smokers' weight 95% confidence interval: (array([76.96963347]), array([77.7190681]))
Non smokers' weight 95% confidence interval: (array([73.79142756]), array([74.01327548]))

# Compare confidence intervals
if smokers_ci[0] < non_smokers_ci[1]:
    print("Smokers weight less than non smokers with 95% confidence.")
else:
    print("We cannot say with 95% confidence that smokers weight less than non smokers.")

We cannot say with 95% confidence that smokers weight less than non smokers.

italy = covid[covid['location'] == 'Italy'][['date', 'new_cases']]
italy.sort_values('date', inplace=True)
italy['total_cases'] = italy['new_cases'].cumsum()
italy.drop('new_cases', axis=1, inplace=True)
italy.reset_index(inplace=True, drop=True)
italy

germany = covid[covid['location'] == 'Germany'][['date', 'new_cases']]
germany.sort_values('date', inplace=True)
germany['total_cases'] = germany['new_cases'].cumsum()
germany.drop('new_cases', axis=1, inplace=True)
germany.reset_index(inplace=True, drop=True)
germany

merged = pd.merge(italy, germany, on='date', how='left')
merged.rename(columns={'total_cases_x': 'italy_cases', 'total_cases_y': 'germany_cases'}, inplace=True)
merged

merged['difference'] = merged['italy_cases'] - merged['germany_cases']
merged

q9 = merged[merged['difference'] > 10000]
q9.reset_index(inplace=True, drop=True)
q9.iloc[0]

date             2020-03-12
italy_cases           12462
germany_cases          1567
difference            10895
Name: 0, dtype: object

q10 = italy[italy['date'] >= '2020-02-28']
q10 = q10[q10['date'] <= '2020-03-20']
q10.reset_index(inplace=True, drop=True)
q10['days_passed'] = q10.index
q10

from scipy.optimize import curve_fit

x_data = q10['days_passed']
y_data = q10['total_cases']

# Define the form of the function we want to fit
def func(x, A, B):
    return A * np.exp(B * x)

# Use curve_fit to fit the function to the data
params, params_covariance = curve_fit(func, x_data, np.log(y_data))

A, B = params

print(f"A = {A}, B = {B}")

A = 6.982215309435218, B = 0.02156805534995767

day_2020_03_20 = q10[q10['date'] == '2020-03-20']['days_passed'].iloc[0]

# Calculate predicted number of cases
predicted_cases = np.exp(func(day_2020_03_20, A, B))
predicted_cases

58830.49142421084

# Get actual number of cases
actual_cases = q10[q10['days_passed'] == day_2020_03_20]['total_cases'].values[0]
actual_cases

41035

# Calculate difference
difference = predicted_cases - actual_cases

print(f"The difference between the predicted and the real number of cases on 2020-03-20 is {round(difference)}")

The difference between the predicted and the real number of cases on 2020-03-20 is 17795

q11 = covid[['location', 'new_deaths', 'population']].groupby('location').agg({'new_deaths': 'sum', 'population': 'mean'})
q11.rename(columns={'new_deaths': 'total_deaths'}, inplace=True)
q11['death_rate_percent'] = (q11['total_deaths'] / q11['population']) * 100
q11.sort_values('death_rate_percent', ascending=False, inplace=True)
q11.reset_index(inplace=True)
q11

q11.iloc[2]['location']

'Andorra'

q12 = covid[['location', 'new_deaths', 'population', 'aged_65_older_percent']].groupby('location').agg({'new_deaths': 'sum', 'population': 'mean', 'aged_65_older_percent': 'mean'})
q12.rename(columns={'new_deaths': 'total_deaths'}, inplace=True)
q12['more_than_20_percent'] = q12['aged_65_older_percent'] > 20
q12['death_rate_per_million'] = (q12['total_deaths'] / q12['population']) * 1000000
q12.sort_values('death_rate_per_million', ascending=False, inplace=True)
q12.dropna(inplace=True)
q12.reset_index(inplace=True)
q12

q12.drop(columns=['total_deaths', 'population', 'aged_65_older_percent'], inplace=True)
q12['prediction'] = q12.apply(lambda x : True if x['more_than_20_percent'] == True and x['death_rate_per_million'] > 50 else False, axis=1)
q12['target'] = q12['death_rate_per_million'] > 50
q12

# True positives
tp = q12[(q12['prediction'] == True) & (q12['target'] == True)].shape[0]

# Number of all positives
p = q12[q12['prediction'] == True].shape[0]

# Number of all samples that should have been predicted as positive
t = q12[q12['target'] == True].shape[0]

# Precision
precision = tp / p

# Recall
recall = tp / t

print(f"Precision: {precision}")
print(f"Recall: {recall}")

# F1 score
f1 = 2 * (precision * recall) / (precision + recall)
print(f"F1 score: {f1}")

Precision: 1.0
Recall: 0.10810810810810811
F1 score: 0.1951219512195122

q13 = covid[['location', 'gdp_per_capita', 'hospital_beds_per_thousand']].groupby('location').mean()
q13.dropna(inplace=True)
q13.sort_values('gdp_per_capita', ascending=False, inplace=True)
q13.reset_index(inplace=True)
q13

df_AB = q13[q13['gdp_per_capita'] > 10000]
df_AB = df_AB[df_AB['hospital_beds_per_thousand'] > 5]
df_AB

n_countries = covid['location'].nunique()
n_countries

212

pAB = df_AB.shape[0] / n_countries
pAB

0.10377358490566038

df_B = q13[q13['hospital_beds_per_thousand'] >= 5]
df_B

pB = df_B.shape[0] / n_countries
pB

0.12264150943396226

p = pAB / pB
round(p*100)

85

	id	age	gender	height	weight	ap_hi	ap_lo	cholesterol	smoke
0	0	18393	2	168	62.0	110	80	1	0
1	1	20228	1	156	85.0	140	90	3	0
2	2	18857	1	165	64.0	130	70	3	0
3	3	17623	2	169	82.0	150	100	1	0
4	4	17474	1	156	56.0	100	60	1	0
...	...	...	...	...	...	...	...	...	...
69995	99993	19240	2	168	76.0	120	80	1	1
69996	99995	22601	1	158	126.0	140	90	2	0
69997	99996	19066	2	183	105.0	180	90	3	0
69998	99998	22431	1	163	72.0	135	80	1	0
69999	99999	20540	1	170	72.0	120	80	2	0

	id	alco
0	44	0
1	45	0
2	46	0
3	47	0
4	49	0
...	...	...
56898	99949	0
56899	99950	0
56900	99951	0
56901	99952	0
56902	99954	0

	location	date	new_cases	new_deaths	population	aged_65_older_percent	gdp_per_capita	hospital_beds_per_thousand
0	Afghanistan	2019-12-31	0	0	38928341.0	2.581	1803.987	0.5
1	Afghanistan	2020-01-01	0	0	38928341.0	2.581	1803.987	0.5
2	Afghanistan	2020-01-02	0	0	38928341.0	2.581	1803.987	0.5
3	Afghanistan	2020-01-03	0	0	38928341.0	2.581	1803.987	0.5
4	Afghanistan	2020-01-04	0	0	38928341.0	2.581	1803.987	0.5
...	...	...	...	...	...	...	...	...
23077	International	2020-02-28	0	0	NaN	NaN	NaN	NaN
23078	International	2020-02-29	0	2	NaN	NaN	NaN	NaN
23079	International	2020-03-01	0	0	NaN	NaN	NaN	NaN
23080	International	2020-03-02	0	0	NaN	NaN	NaN	NaN
23081	International	2020-03-10	-9	1	NaN	NaN	NaN	NaN

	id	age	gender	height	weight	ap_hi	ap_lo	cholesterol	smoke
0	0	50	2	168	62.0	110	80	1	0
1	1	55	1	156	85.0	140	90	3	0
2	2	51	1	165	64.0	130	70	3	0
3	3	48	2	169	82.0	150	100	1	0
4	4	47	1	156	56.0	100	60	1	0
...	...	...	...	...	...	...	...	...	...
69995	99993	52	2	168	76.0	120	80	1	1
69996	99995	61	1	158	126.0	140	90	2	0
69997	99996	52	2	183	105.0	180	90	3	0
69998	99998	61	1	163	72.0	135	80	1	0
69999	99999	56	1	170	72.0	120	80	2	0

	id	age	gender	height	weight	ap_hi	ap_lo	cholesterol	smoke
id	1.000000	0.003002	0.003506	-0.001886	-0.001292	0.003258	-0.000238	0.005706	-0.003697
age	0.003002	1.000000	-0.020668	-0.083530	0.062952	0.220309	0.157768	0.141571	-0.047372
gender	0.003506	-0.020668	1.000000	0.533805	0.172480	0.063174	0.066374	-0.035730	0.338135
height	-0.001886	-0.083530	0.533805	1.000000	0.313569	0.019381	0.030534	-0.058940	0.197632
weight	-0.001292	0.062952	0.172480	0.313569	1.000000	0.277749	0.254420	0.137908	0.071676
ap_hi	0.003258	0.220309	0.063174	0.019381	0.277749	1.000000	0.735436	0.208722	0.028929
ap_lo	-0.000238	0.157768	0.066374	0.030534	0.254420	0.735436	1.000000	0.167401	0.026172
cholesterol	0.005706	0.141571	-0.035730	-0.058940	0.137908	0.208722	0.167401	1.000000	0.015220
smoke	-0.003697	-0.047372	0.338135	0.197632	0.071676	0.028929	0.026172	0.015220	1.000000

Turing Data Analysis Quiz¶

First Dataset, Cardiovascular data¶

Second Dataset, Covid19 cases¶

First Dataset: Cardiovascular data¶

Q1: How much heavier is the age group with the highest average weight than the age group with the lowest weight?¶

Q2: Do people over 50 have higher cholesterol levels than the rest?¶

Q3: Are men more likely to be a smoker than women?¶

Q4: How tall are the tallest 1% of people?¶

Q5: Which two features have the highest spearman rank correlation?¶

Q6: What percentage of people are more than 2 standard deviations far from the average height?¶

Q7: What percentage of the population over 50 years old consume alcohol?¶

Q8: Which of the following statements is true with 95% confidence?¶

a. Men have higher blood pressure than women?¶

b. Smokers have higher cholesterol level than non smokers?¶

c. Smokers have higher blood pressure than non-smokers¶

d. Smokers weight less than non smokers¶

Second Dataset, Covid19 cases¶

Q9: When did the difference in the total number of confirmed cases between Italy and Germany become more than 10000?¶

Look at the cumulative number of confirmed cases in Italy between 2020-02-28 and 2020-03-20. Fit an exponential function $ y = A * e^{Bx} $ to this set to express cumulative cases as a function of days passed, by minimizing squared loss.¶

Q10: What is the difference between the exponential curve and the total number of real cases on 2020-03-20?¶

Q11: Which country has the 3rd highest death rate?¶

Q12: What is the F1 score of the following statement:¶

Countries, where more than 20% of the population is over 65 years old, have death rates over 50 per million inhabitants.¶

Q13: What is the probability that a country with GDP over $10000, if we know that they have at least 5 hospital beds per 1000 inhabitants.¶

	age	weight
0	63	75.873721
1	61	75.707111
2	59	75.355207
3	55	75.266728
4	57	75.122789
5	51	75.098530
6	47	74.695753
7	53	74.641262
8	64	74.619113
9	58	74.619009
10	45	74.369430
11	56	74.268755
12	48	74.163059
13	62	74.047713
14	60	74.004312
15	43	73.983161
16	54	73.982996
17	49	73.929353
18	52	73.432174
19	50	73.065081
20	41	72.895849
21	44	72.731044
22	46	72.717292
23	39	72.291573
24	42	71.681523
25	40	70.914920
26	29	70.000000
27	30	59.000000

	age	cholesterol	50+
0	29	1.000000	False
1	30	1.000000	False
2	39	1.185955	False
3	40	1.173243	False
4	41	1.209669	False
5	42	1.167137	False
6	43	1.264894	False
7	44	1.204756	False
8	45	1.261620	False
9	46	1.220308	False
10	47	1.308147	False
11	48	1.246273	False
12	49	1.316945	False
13	50	1.280473	False
14	51	1.346496	True
15	52	1.315645	True
16	53	1.380041	True
17	54	1.331761	True
18	55	1.420932	True
19	56	1.396729	True
20	57	1.469072	True
21	58	1.415078	True
22	59	1.511465	True
23	60	1.443125	True
24	61	1.559384	True
25	62	1.450205	True
26	63	1.621345	True
27	64	1.463649	True

	id	age	gender	height	weight	ap_hi	ap_lo	cholesterol	smoke	alco
32	44	40	1	159	60.0	110	70	1	0	0
33	45	56	1	160	73.0	130	85	1	0	0
34	46	60	2	173	82.0	140	90	3	0	0
35	47	55	1	163	55.0	120	80	1	0	0
36	49	50	2	175	95.0	120	80	1	0	0
...	...	...	...	...	...	...	...	...	...	...
69962	99949	57	1	178	69.0	130	90	1	0	0
69963	99950	53	2	155	76.0	130	90	1	0	0
69964	99951	61	2	170	72.0	120	80	1	0	0
69965	99952	42	2	165	66.0	120	80	1	0	0
69966	99954	55	1	156	67.0	140	80	2	0	0

	location	total_deaths	population	death_rate_percent
0	San Marino	42	33938.0	0.123755
1	Belgium	9619	11589616.0	0.082997
2	Andorra	51	77265.0	0.066007
3	United Kingdom	40883	67886004.0	0.060223
4	Spain	27136	46754783.0	0.058039
...	...	...	...	...
207	Greenland	0	56772.0	0.000000
208	Grenada	0	112519.0	0.000000
209	Timor	0	1318442.0	0.000000
210	Uganda	0	45741000.0	0.000000
211	International	7	NaN	NaN

	location	more_than_20_percent	death_rate_per_million	prediction	target
0	Belgium	False	829.967102	False	True
1	United Kingdom	False	602.230174	False	True
2	Spain	False	580.389818	False	True
3	Italy	True	563.049467	True	True
4	Sweden	False	467.063461	False	True
...	...	...	...	...	...
179	Lesotho	False	0.000000	False	False
180	Laos	False	0.000000	False	False
181	Grenada	False	0.000000	False	False
182	Timor	False	0.000000	False	False
183	Uganda	False	0.000000	False	False

	location	gdp_per_capita	hospital_beds_per_thousand
0	Qatar	116935.600	1.20
1	Luxembourg	94277.965	4.51
2	Singapore	85535.383	2.40
3	Brunei	71809.251	2.70
4	Ireland	67335.293	2.96
...	...	...	...
155	Malawi	1095.042	1.30
156	Niger	926.000	0.30
157	Liberia	752.788	0.80
158	Burundi	702.225	0.80
159	Central African Republic	661.240	1.00

	location	gdp_per_capita	hospital_beds_per_thousand
16	Austria	45436.686	7.370
17	Germany	45229.245	8.000
21	Belgium	42658.576	5.640
24	Japan	39002.223	13.050
25	France	38605.671	5.980
29	South Korea	35938.374	12.270
33	Czech Republic	32605.906	6.630
36	Slovakia	30155.152	5.820
37	Lithuania	29524.265	6.560
42	Poland	27216.445	6.620
44	Hungary	26777.561	7.020
47	Latvia	25063.846	5.570
48	Russia	24765.954	8.050
51	Kazakhstan	24055.588	6.700
52	Romania	23313.199	6.892
54	Croatia	22669.797	5.540
62	Bulgaria	18563.307	7.454
65	Belarus	17167.967	11.000
66	Barbados	16978.068	5.800
68	Gabon	16562.413	6.300
79	Serbia	14048.881	5.609
89	Mongolia	11840.846	7.000

	gender	smoke
0	1	813
1	2	5356

	is_far	id
0	False	67665
1	True	2335

	alco	id
0	0	35064
1	1	1810

	date	total_cases
0	2019-12-31	0
1	2020-01-01	0
2	2020-01-02	0
3	2020-01-03	0
4	2020-01-04	0
...	...	...
158	2020-06-06	234531
159	2020-06-07	234801
160	2020-06-08	234998
161	2020-06-09	235278
162	2020-06-10	235561