import random as rand
import numpy as np
from scipy import stats
# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'
# Plot matplotlib plots with white background:
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
What is central limit theorem?
The distribution of the sum of independent samples consisting of n
points drawn from an arbitrary distribution approach a normal distribution as n
increases.
If the distribution of the values has a mean and standard deviation, the distribution of sum is approximately given by $ N(n, n^2)$
Some points to keep in mind: - The values are to be drawn independently - The values have to come from same distribution - The underlying distribution should have finite mean and variance - The rate convergence to the normal distribution depends on the skewness of the parent distribution.
We start with some crazy distribution that has got nothing to do with a normal distribution. Sample points from that distribution with some arbitrary sample size, following which we plot the sample mean (or sample sum) on a frequency table – repeat this lot of times (tending to infinity) we end up getting a normal distribution of sample means!
The Central Limit Theorem explains the prevalence of normal distributions in the natural world. This limit is central to the ideas of hypothesis testing and helpful for estimating confidence intervals.
Below a simple python experiment to show this in action.
import matplotlib.pyplot as plt
import seaborn as sns
= {
plot_params 'font.size' : 10,
'axes.titlesize' : 10,
'axes.labelsize' : 10,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 10,
'ytick.labelsize' : 10,
}
plt.rcParams.update(plot_params)
'colorblind') sns.color_palette(
from numpy.random import default_rng
= default_rng(42) rng
1. Discrete distribution
For this case let’s assume we have a dice which is unfair and does not ever land on 3 and 5, and lands more on 2 and 6. We can build this skewed probability into the dice using the weights.
= np.arange(1,7) # Dice numbers possible
dice = [0.2, 0.3, 0.0, 0.2, 0.0, 0.3] #Weighted probabilites for the numbers probabilities
Define a function to draw samples from the dice and calculate the mean.
# Draw sample size = n, take the mean and plot the frequencies
def sample_draw_mean(_trials=1000, _sample_size=1):
= []
sample_mean_trials # Sample a number from the distribution equal to trials
for i in range(_trials):
= rng.choice(dice, size=_sample_size, p=probabilities, replace=True)
sample
sample_mean_trials.append(np.mean(sample))return sample_mean_trials
Drawing sample_size
=1 from the distribution multiple times, i.e. equal to num_of_trials
variable
= 1000
num_of_trials = 1
sample_size =num_of_trials, _sample_size=sample_size), bins=len(dice), stat='density', kde=True);
sns.histplot(sample_draw_mean(_trials'Visualize sample mean distribution for {} sample drawn {} times'.format(sample_size, num_of_trials), fontsize=15); plt.title(
For sample size of 1, the frequency of numbers rolled by the unfair dice relates to the probability we have set above. However we can start to define samples from that distribution wherein, instead of single number we draw (for example 4).
Plotting sampling distribution of sample mean
= 1000
num_of_trials = 4
sample_size =num_of_trials, _sample_size=sample_size), bins=len(dice), stat='density', kde=True);
sns.histplot(sample_draw_mean(_trials'Visualize sample mean distribution for {} sample drawn {} times'.format(sample_size, num_of_trials), fontsize=15); plt.title(
= 1000
num_of_trials = 20
sample_size =num_of_trials, _sample_size=sample_size), bins=len(dice), stat='density', kde=True);
sns.histplot(sample_draw_mean(_trials'Visualize sample mean distribution for {} sample drawn {} times'.format(sample_size, num_of_trials), fontsize=15); plt.title(
As we keep plotting the frequency distribution for the sample mean it starts to approach the normal distribution!
def normal_distribution(x, mean=0, sigma=1):
= (1/np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-1/2 * ((x - mean)/sigma)**2)
out return(out)
= 1000
num_of_trials = 20
sample_size
= plt.subplots(1,1, figsize=(5,5))
fig, ax = np.sort(sample_draw_mean(_trials=num_of_trials, _sample_size=sample_size))
sample_means # Plot histogram density
=len(dice), stat='density', kde=False, ax=ax)
sns.histplot(sample_means, bins# Plot normal distribution
='black', linestyle='--', label='Normal Distribution')
ax.plot(sample_means, normal_distribution(sample_means, np.mean(sample_means), np.std(sample_means)), color# Plot sample mean
='red', linestyle='--', linewidth=2.0, label='Sample Mean')
ax.axvline(np.mean(sample_means), color'Dice number')
ax.set_xlabel('Visualize sample mean distribution for {} sample drawn {} times'.format(sample_size, num_of_trials), fontsize=15);
plt.title( plt.tight_layout()
2. Continuous distibution
# Define an exponential distribution
= 5.0
beta = 1000
num_of_trials = [1, 10, 100, 500] sample_size_list
def generate_mean_samples(_beta, _iter, _sample_size):
= []
samples_mean for i in range(_iter):
= np.random.exponential(_beta, _sample_size)
sample_numbers
samples_mean.append(np.mean(sample_numbers))return(samples_mean)
= []
sample_plot_list for n in sample_size_list:
sample_plot_list.append((n, generate_mean_samples(beta, num_of_trials, n)))
= plt.subplots(2,2, figsize=(10,10))
fig, ax = ax.flatten()
ax for i, entry in enumerate(sample_plot_list):
1], stat='density', alpha=0.6, kde=False, ax=ax[i])
sns.histplot(entry['Sample size: {}'.format(entry[0]))
ax[i].set_title(= np.mean(entry[1])
sample_mean = np.std(entry[1])
sample_std = np.sort(entry[1])
normal_x # Plot normal distribution
=4.0, color='black', linestyle='--', label='Normal Distribution')
ax[i].plot(normal_x, normal_distribution(normal_x,sample_mean,sample_std), linewidth
# Sample mean
='red', linestyle='--', linewidth=4.0, label='Sample Mean')
ax[i].axvline(sample_mean, color'Sample Mean')
ax[i].set_xlabel(r'Visualize sample mean distribution for Exponential distribution $\beta$={}, Sampled {} times'.format(beta, num_of_trials));
plt.suptitle(=(1.04,1), loc="upper left")
plt.legend(bbox_to_anchor#plt.tight_layout()
3. Is it a fair coin?
Estimate coin toss probability
A coin is flipped 30 times, you get 22 heads. Find if the coin is fair or not. That is, if the probability of getting heads-tails is 50%.
This can be solved by estimating the probability of getting heads / tails provided the above condition is met.
Since we can model the coin toss process (a priori model) using Bernoulli’s distribution, we will estimate the probability of 22 heads considering a fair coin. This will be our Null Hypothesis.
Null hypothesis: The null hypothesis is a model of the system based on the assumption that the apparent effect was actually due to chance.
Assuming a bernoulli distribution:
\[X_{i} \sim B(p)\]
\[ P(N_H=22) = \binom nx p^{22}(1-p)^{30-22} \]
By central limit theorem: \[ \sum_{i=1}^{30}{X_{i}} \sim N(30p, 30(1-p)) \]
From maximum likelihood estimate, more detailts on MLE can be found here.:
\[ \hat{p} = 0.73 \]
Estimate 95% confidence interval: * Assuming a normal distribution: \[ \mu \pm 1.96 \sigma \]
\[ 30\hat{p} \pm 1.96 \sqrt{ 30 * (1-\hat{p}) } \]
\[ 22 \pm 1.96 \sqrt{( 30 * 0.26 )} = (16.4, 27.58) \]
= np.random.default_rng(42) rng
Define a numpy.random.choice
function to simulate coin tosses. This can repeated to 30 times.
= rng.choice([0,1], replace=True, size=30) # we can randint(2) here as well. sampling_30
np.where
is used to find the entries with heads, that way for each 30 coin tosses we can estimate how many heads are there. In this case we are treating heads as 1 and tails as 0
len(np.where(sampling_30 == 1)[0]) # or just sum the list since all we have is 1 / 0
15
sum(sampling_30)
15
Setup the problem to perform multiple trails of 30 coin tosses, when done with the trials we will keep an account of how many of those trials had 22 heads.
= 22
heads_condition = []
num_heads_list = 0
constraint_satisy = 5000 num_trials
for _ in range(num_trials):
= rng.choice([0,1], replace=True, size=30, p=[0.50,0.50]) # A-priori fair coin toss model
sampling_30 = len(np.where(sampling_30 == 1)[0])
number_of_heads
num_heads_list.append(number_of_heads)
if number_of_heads == heads_condition:
= constraint_satisy + 1
constraint_satisy
= np.array(num_heads_list) num_heads_list
len(num_heads_list)
5000
Defining a normal distribution function from scipy
or we could also use the function defined previously.
from scipy.stats import norm
= np.linspace(min(num_heads_list), max(num_heads_list))
x = norm(np.mean(num_heads_list), np.std(num_heads_list)) std_norm_coin
= np.quantile(num_heads_list, [0.025, 0.975]) quantiles_95_confidence
= plt.subplots(1,1, figsize=(8,5))
fig, ax
# Plot histogram density
='density', kde=False, ax=ax)
sns.histplot(num_heads_list, stat
# Plot normal distribution
='black', linestyle='--', label='Normal Distribution')
ax.plot(x, std_norm_coin.pdf(x), color
# Plot sample mean
='red', linestyle='--', linewidth=2.0, label='Sample Mean')
ax.axvline(np.mean(num_heads_list), color='blue', linestyle='-', linewidth=2.0, label='Experiment condition')
ax.axvline(heads_condition, color
# Plot confidence interval
0], quantiles_95_confidence[1], alpha=0.15, color='yellow',label='95% confidence interval')
ax.axvspan(quantiles_95_confidence[
'Number of heads in 30 coin tosses')
ax.set_xlabel('Visualize distribution of number of heads for 30 coin tosses sampled {} times'.format(num_trials), fontsize=15);
plt.title(="upper left")
plt.legend(loc plt.tight_layout()
p-value estimate
= constraint_satisy / num_trials
p_value print(p_value)
0.0042
Since p-value is less than 0.05, this means the coin is not fair
For most problems, we only care about the order of magnitude: if the p-value is smaller that 1/100, the effect is likely to be real; if it is greater than 1/10, probably not. If you think there is a difference between a 4.8% (significant!) and 5.2% (not significant!), you are taking it too seriously.