Issue
I'm working with a data set. So far I have made a histogram with an overlaid normal distribution curve.
I want to mark the quartiles as in this image (the box plot is for reference). This is the code I'm working with:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
# Draw a density histogram of the departure delays, overlay a normal curve
# fitted to the sample mean/std, and put a marker on the curve at the median.
# NOTE(review): depDelay is defined outside this snippet; the code assumes it
# is a 1-D numeric sequence that supports an in-place .sort() -- confirm.
depDelay.sort()  # sorted x values so the curve is drawn left to right
# `normed` was removed in Matplotlib 3.1; `density=True` is its replacement.
plt.hist(depDelay, bins=100, density=True)
hmean = np.mean(depDelay)
hstd = np.std(depDelay)
pdf = stats.norm.pdf(depDelay, hmean, hstd)
# `markevery` expects point *indices*, not data values, so translate the
# median into the index of the first sorted sample at or above it.
markers = [int(np.searchsorted(depDelay, np.percentile(depDelay, 50)))]
plt.plot(depDelay, pdf, '-o', markevery=markers)
plt.title('Distribution of Departure Delay')
plt.xlabel('Departure Delay (in mins)')
plt.ylabel('Frequency')
plt.savefig('depDelayNormDist.png')
plt.show()
How can I plot the same thing using matplotlib?
Solution
I've tried to replicate the referenced image somewhat. Not sure what precisely you meant by marking the quartiles, but I've put in labels for Q1 and Q3 at the pdf and percentages in between the quartiles.
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
# NOTE: the unused `from matplotlib.mlab import normpdf` import was dropped;
# normpdf was removed in Matplotlib 3.1 and importing it raises ImportError.

# Two stacked axes sharing x: a horizontal box plot on top and, below it, a
# density histogram with the reference normal pdf, the bands between each
# quartile and its 1.5*IQR fence shaded, and the probability mass of each
# region printed along the x axis.

# dummy data
mu = 0
sigma = 1
n_bins = 50
s = np.random.normal(mu, sigma, 1000)
dist = norm(mu, sigma)  # frozen reference distribution, reused below

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

# histogram (`normed` was removed in Matplotlib 3.1; use `density`)
n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black')
pdf = dist.pdf(bins)

# sample quartiles and the box-plot whisker fences derived from them
q1, median, q3 = np.percentile(s, [25, 50, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
print(q1, median, q3)

# probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)

# Select x and y for each shaded band with the SAME mask so the two arrays
# always have equal length, as fill_between requires.
lower_band = (bins >= lower_fence) & (bins <= q1)
upper_band = (bins >= q3) & (bins <= upper_fence)

# fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
axes[1].fill_between(bins[lower_band], pdf[lower_band], 0, alpha=.6, color='orange')
axes[1].fill_between(bins[upper_band], pdf[upper_band], 0, alpha=.6, color='orange')

print(dist.cdf(median))
print(dist.pdf(median))

# add text to bottom graph: each label is the reference-normal probability
# mass of the region it is centred under.
axes[1].annotate("{:.1f}%".format(100 * (dist.cdf(q1) - dist.cdf(lower_fence))),
                 xy=((lower_fence + q1) / 2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100 * (dist.cdf(q3) - dist.cdf(q1))),
                 xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100 * (dist.cdf(upper_fence) - dist.cdf(q3))),
                 xy=((q3 + upper_fence) / 2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, dist.pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, dist.pdf(q3)), ha='center')
axes[1].set_ylabel('probability')

# top boxplot (no notch, green diamond fliers, drawn horizontally)
axes[0].boxplot(s, notch=False, sym='gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')

# remove the gap so the box plot sits directly on top of the histogram
plt.subplots_adjust(hspace=0)
plt.show()
FYI, I've answered this similar question as well.
Answered By - Chris
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.