"""Statistické testování průměrné délky slova v textech: normalita rozdělení a její ověření, t-test, Mann-Whitney-Wilcox test""" # je treba nainstalovat scipy a matplotlib # py -m pip install scipy # text_01 with open('texty/text_01.txt', encoding='UTF-8') as f: obsah = f.read() text_bez_interpunkce = obsah.translate(str.maketrans({',':'', '.':'', '!':'', '?':'', '(':'', ')':'', '„':'', '“':'', '–':'' })) # print(text_bez_interpunkce) text_bez_interpunkce_lc = text_bez_interpunkce.lower() # print(text_bez_interpunkce_lc) text_01 = text_bez_interpunkce_lc.split() print(text_01) # text_02 with open('texty/text_02.txt', encoding='UTF-8') as f: obsah = f.read() text_bez_interpunkce = obsah.translate(str.maketrans({',':'', '.':'', '!':'', '?':'', '(':'', ')':'', '„':'', '“':'', '–':'' })) # print(text_bez_interpunkce) text_bez_interpunkce_lc = text_bez_interpunkce.lower() # print(text_bez_interpunkce_lc) text_02 = text_bez_interpunkce_lc.split() print(text_02) # delky slov v poctu pismen delky_slov_01 = [] for slovo in text_01: delky_slov_01.append(len(slovo)) print(delky_slov_01) delky_slov_02 = [] for slovo in text_02: delky_slov_02.append(len(slovo)) print(delky_slov_02) # vypocitame prumer, median, modus a SD u obou textu import statistics import numpy # pro tokeny statistics.mean(delky_slov_01) statistics.stdev(delky_slov_01) statistics.median(delky_slov_01) statistics.mode(delky_slov_01) mean_L_text_01 = statistics.mean(delky_slov_01) mean_L_text_02 = statistics.mean(delky_slov_02) median_L_text_01 = statistics.median(delky_slov_01) median_L_text_02 = statistics.median(delky_slov_02) mode_L_text_01 = statistics.mode(delky_slov_01) mode_L_text_02 = statistics.mode(delky_slov_02) sd_L_text_01 = numpy.std(delky_slov_01) sd_L_text_02 = numpy.std(delky_slov_02) # testovani noramlity rozdeleni, Shapiro test from scipy import stats # HO rika, ze data jsou normalne rozdelena, pokud p-value menzi ne 0.05, zamitame HO shapiro_01 = stats.shapiro(delky_slov_01) print(shapiro_01) # k samotnemu p-value a statisticke hodnote se dostaneme takto shapiro_01.pvalue shapiro_01.statistic shapiro_02 = stats.shapiro(delky_slov_02) print(shapiro_02) shapiro_02.pvalue # pro grafickou jednoduchou kontrolu # Quantile-Quantile Plot # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.probplot.html import pylab stats.probplot(delky_slov_02, dist="norm", plot=pylab) pylab.show() # histogram import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html plt.hist(delky_slov_02) plt.show() #ukazeme si, jak by to vypadalo na datech z normalni distribuce import numpy # genereujeme si normalne rozdelena data, loc je prumer, scale je SD # https://numpy.org/doc/stable/reference/random/generated/numpy.random.normal.html normdata = numpy.random.normal(loc=5, scale=2, size=100) stats.shapiro(normdata) stats.probplot(normdata, dist="norm", plot=pylab) pylab.show() plt.hist(normdata) plt.show() # Mann-Whitney U test # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html mwwu_test = stats.mannwhitneyu(delky_slov_01, delky_slov_02) print(mwwu_test) mwwu_test.pvalue # pokud data normalne rozdelena - t-test (Two sample t-test (unpaired or independent t-test)) # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html stats.ttest_ind(delky_slov_01, delky_slov_02) # obrazky import matplotlib.pyplot import numpy # barplot from collections import Counter freq_lengths_01 = dict(Counter(delky_slov_01)) print(freq_lengths_01) lengths = list(freq_lengths_01.keys()) frequencies = list(freq_lengths_01.values()) plt.bar(lengths, frequencies) matplotlib.pyplot.xlabel('délky') matplotlib.pyplot.ylabel('frekvence') matplotlib.pyplot.show() # scatterplot matplotlib.pyplot.scatter(lengths, frequencies) matplotlib.pyplot.show() # boxplot # vytvore obrazku fig, axes = matplotlib.pyplot.subplots(nrows= 1, ncols= 2) axes[0].boxplot(delky_slov_01) axes[0].set_title('text_01') axes[0].set_ylabel('délka') axes[0].set_yticks(range(0,26,5)) axes[1].boxplot(delky_slov_02) axes[1].set_title('text_02') axes[1].set_yticks(range(0,26,5)) matplotlib.pyplot.show() # UKOL: otestujte rozdily delek slov pro typy