In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

# Report the interpreter path and library versions behind this notebook's outputs,
# so results can be matched to an environment when re-running.
print("Python", sys.executable)
print("Numpy", np.__version__)
print("Pandas", pd.__version__)
print("Seaborn", sns.__version__)
print("Matplotlib", matplotlib.__version__)
# The comma after the label was missing, which made the whole cell a SyntaxError.
print("Matplotlib backend", matplotlib.get_backend())
Python /Users/mlubinsky/anaconda3/bin/python
Numpy 1.23.5
Pandas 1.5.3
Seaborn 0.12.2
Matplotlib 3.7.0
module://matplotlib_inline.backend_inline

Normal distribution¶

In [2]:
from scipy.stats import norm
# Density of the standard normal N(0, 1) on a fine grid over [-4, 4).
x = np.arange(-4, 4, 0.001)

fig, ax = plt.subplots()
ax.plot(x, norm.pdf(x))
# Labels and the fixed y-range are applied in a single call.
ax.set(
    title='N(0,$1^2$)',
    xlabel='x',
    ylabel='f(x)',
    ylim=(0, 0.45),
)
plt.show()
In [3]:
# Overlay zero-mean normal densities for several standard deviations.
x = np.linspace(-10, 10, 100)
stdvs = [1.0, 2.0, 3.0, 4.0]

fig, ax = plt.subplots()
for s in stdvs:
    density = norm.pdf(x, scale=s)
    ax.plot(x, density, label='stdv=%.1f' % s)

ax.set(
    xlabel='x',
    ylabel='pdf(x)',
    title='Normal Distribution',
    ylim=(0, 0.45),
)
ax.legend(loc='best', frameon=True)
ax.grid(True)
In [4]:
# Draw 1000 samples from N(0, 2^2) and compare their histogram with the true pdf.
# A fixed random_state makes the sample, and the mean/variance in the title,
# reproducible under Restart & Run All.
xs = norm.rvs(scale=2, size=1000, random_state=42)
x = np.linspace(-10, 10, 100)
p = norm.pdf(x, scale=2)
v = np.var(xs)   # np.var default (ddof=0)
m = np.mean(xs)

# plt.subplots() already provides the axes. The former extra
# fig.add_subplot(111) stacked a second axes on the same figure.
fig, ax = plt.subplots()
ax.hist(xs, bins=10, alpha=0.5, density=True)  # density=True puts the histogram on the pdf scale
ax.plot(x, p, 'r-', lw=2)
ax.set_xlabel('x')
ax.set_ylabel('pdf(x)')
ax.set_title(f'mean={m:.2f}, var={v:.2f}')
ax.grid(True)
In [5]:
# Standard normal density with the area to the left of x = 1 shaded.
x = np.arange(-4, 4, 0.001)     # grid for the density curve
px = np.arange(-4, 1, 0.01)     # grid for the shaded region

fig, ax = plt.subplots()
ax.plot(x, norm.pdf(x))
ax.fill_between(px, norm.pdf(px), alpha=0.5, color='g')
ax.text(-1, 0.1, "cdf(x)", fontsize=20)   # label placed inside the shaded area

ax.set(
    title="Cumulative normal distribution",
    xlabel='x',
    ylabel='pdf(x)',
    ylim=(0, 0.5),
)
ax.grid(True)
plt.show()

Calculating the probability of normal distribution¶

In [6]:
from scipy.stats import norm
# P(X < 2) for X ~ N(3, 2^2), evaluated through a frozen distribution object.
lessthan2 = norm(loc=3, scale=2).cdf(2)
print(lessthan2)
0.3085375387259869
In [7]:
# Density of N(3, 2^2) with the area to the left of x = 2 shaded and
# annotated with the probability computed earlier in `lessthan2`.
x = np.arange(-4, 10, 0.001)    # grid for the density curve
px = np.arange(-4, 2, 0.01)     # grid for the shaded region

fig, ax = plt.subplots()
ax.plot(x, norm.pdf(x, loc=3, scale=2))
ax.fill_between(px, norm.pdf(px, loc=3, scale=2), alpha=0.5, color='g')
ax.text(-0.5, 0.02, round(lessthan2, 2), fontsize=20)

ax.set(
    title="N(3,$2^2$)",
    xlabel='x',
    ylabel='pdf(x)',
    ylim=(0, 0.25),
)
ax.grid(True)
plt.show()

https://towardsdatascience.com/exploring-normal-distribution-with-jupyter-notebook-3645ec2d83f8¶

https://karliris62.medium.com/probability-distribution-through-python-code-data-science-in-experiment-299ac949d342¶

Scipy Stat https://docs.scipy.org/doc/scipy/tutorial/stats.html¶

In [12]:
# There are > 120 distributions in SciPy
# https://towardsdatascience.com/probability-distributions-with-pythons-scipy-3da89bf60565

from scipy import stats
# Collect the names in scipy.stats that are bound to distribution instances,
# split by whether they are rv_continuous or rv_discrete objects.
dist_continu = [name for name in dir(stats)
                if isinstance(getattr(stats, name), stats.rv_continuous)]
dist_discrete = [name for name in dir(stats)
                 if isinstance(getattr(stats, name), stats.rv_discrete)]
# The module is scipy.stats; the messages previously said "scipy.stat".
print(f'scipy.stats: number of continuous distributions: {len(dist_continu)}')
print(f'scipy.stats: number of discrete distributions:   {len(dist_discrete)}')
scipy.stat: number of continuous distributions: 107
scipy.stat: number of discrete distributions:   19

https://github.com/Probability-Statistics-Jupyter-Notebook/probability-statistics-notebook/tree/master¶

Statsmodels https://www.statsmodels.org/¶

Implements regression, linear models, time series analysis, and extensions to topics also covered by scipy.stats.

https://medium.com/geekculture/an-overview-of-11-classic-time-series-forecasting-methods-in-statsmodels-bc7728f657f4¶

In [11]:
import statsmodels.api as sm
import seaborn as sns

# Load the state crime dataset bundled with statsmodels as a pandas DataFrame.
data = sm.datasets.statecrime.load_pandas().data

# Scatter plot with a fitted regression line: murder against poverty.
sns.set(style="whitegrid")
fig = sns.lmplot(
    data=data,
    x="poverty",
    y="murder",
)

Pomegranate https://pomegranate.readthedocs.io/¶