"""ngrams Word Popularity.

Ask for a comma-separated list of words, download their yearly usage
frequencies from the Google Books Ngram JSON endpoint, print a table of summary
statistics (one row per word) and optionally save that table as a CSV file and
the frequency curves as a PNG graph in the directory that holds this script.

Sample CSV committed together with this script (``poop, klutzy.csv``; the
graph saved for the same words is ``poop, klutzy.png``)::

    ,word,stdev,mean,median,mode,range,q1,q3,iqr
    0,poop,2.599393484092581e-07,6.139207233218996e-07,6.637646704023479e-07,9.126888329547e-07,1.069131408420227e-06,4.1074716961020385e-07,8.147061285918815e-07,4.039589589816777e-07
    1,klutzy,7.003951820605788e-09,3.2869637653755984e-09,0.0,0.0,2.649079254370333e-08,0.0,1.0566731666248965e-10,1.0566731666248965e-10
"""
import json
import os
import statistics

import numpy
import pandas

# Print floats in fixed notation; the frequencies are tiny and would otherwise
# be shown in scientific notation.
pandas.options.display.float_format = '{:.10f}'.format

startYear = 1800
endYear = 2019
ngramsUrl = 'https://books.google.com/ngrams/json'


def fetchNgrams(words, firstYear=startYear, lastYear=endYear, timeout=30):
    """Download the ngram time series for *words*.

    Args:
        words: The raw comma-separated word list typed by the user.
        firstYear: First year of the requested range.
        lastYear: Last year of the requested range.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload. The rest of the script reads it as a list
        of dicts with an ``'ngram'`` and a ``'timeseries'`` key.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    # Imported here so the statistics helpers stay importable without the
    # networking dependency.
    import requests

    # Passing the query as ``params`` lets requests URL-encode the user's
    # input; interpolating it into the URL by hand breaks on characters such
    # as '+', '#' or '&'.  corpus and smoothing keep the values this script
    # has always sent.
    response = requests.get(
        ngramsUrl,
        params={
            'content': words,
            'year_start': firstYear,
            'year_end': lastYear,
            'corpus': 26,
            'smoothing': 3,
        },
        timeout=timeout,
    )
    # Fail with the HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()
    return json.loads(response.content)


def summarize(ngram, timeseries):
    """Return the summary-statistics row for one word.

    Args:
        ngram: The word (or phrase) the series belongs to.
        timeseries: Sequence of numeric frequencies for that word.

    Returns:
        A dict with the keys ``word``, ``stdev``, ``mean``, ``median``,
        ``mode``, ``range``, ``q1``, ``q3`` and ``iqr`` (in that order, which
        is also the column order of the printed table and the CSV).
    """
    q1 = numpy.percentile(timeseries, 25)
    q3 = numpy.percentile(timeseries, 75)
    return {
        'word': ngram,
        'stdev': numpy.std(timeseries),
        'mean': numpy.mean(timeseries),
        'median': numpy.median(timeseries),
        'mode': statistics.mode(timeseries),
        'range': max(timeseries) - min(timeseries),
        'q1': q1,
        'q3': q3,
        'iqr': q3 - q1,
    }


def stdevVerdicts(frames):
    """Label the words with the highest and lowest standard deviation.

    Args:
        frames: List of rows as produced by :func:`summarize`; only the
            ``'word'`` and ``'stdev'`` keys are read.

    Returns:
        A list with one entry per frame, in the same order: the message to
        print for that word, or ``None`` when it is neither extreme.  A word
        that is both highest and lowest (e.g. the only word) is reported as
        highest.
    """
    if not frames:
        return []

    # Compute both extremes once instead of once per frame.
    deviations = [frame['stdev'] for frame in frames]
    highest = max(deviations)
    lowest = min(deviations)

    verdicts = []
    for frame in frames:
        if frame['stdev'] == highest:
            verdicts.append('%s has the highest standard deviation!' % frame['word'])
        elif frame['stdev'] == lowest:
            verdicts.append('%s has the lowest standard deviation!' % frame['word'])
        else:
            verdicts.append(None)
    return verdicts


def askYesNo(prompt):
    """Repeat *prompt* until the user answers ``y`` or ``n``.

    The answer is stripped and lower-cased, so both prompts of the script
    accept ``Y``/``N`` as well.

    Returns:
        ``True`` for yes, ``False`` for no.
    """
    while True:
        answer = input(prompt).strip().lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def main():
    """Run the interactive session: ask, fetch, report, optionally save."""
    # Imported here so the statistics helpers stay importable without the
    # plotting dependency.
    import matplotlib.pyplot as plt

    words = input('Please enter a list of words. Separate each word with a comma (:\n')
    print()

    data = fetchNgrams(words)
    if not data:
        # Nothing to tabulate or plot; do not offer to save empty files.
        print('No ngram data was found for: %s' % words)
        return

    years = range(startYear, endYear + 1)
    frames = []
    for series in data:
        frame = summarize(series['ngram'], series['timeseries'])
        frames.append(frame)
        plt.plot(years, series['timeseries'], label=frame['word'])

    df = pandas.DataFrame(frames)
    print(df)

    # Summary Statistics
    for verdict in stdevVerdicts(frames):
        print()
        if verdict is not None:
            print(verdict)

    # Output files are written next to this script and named after the words.
    dirName = os.path.dirname(os.path.realpath(__file__))
    wordList = ', '.join(frame['word'] for frame in frames)

    # Save CSV
    if askYesNo('Would you like to save this data frame in a CSV? (y/n)'):
        df.to_csv(os.path.join(dirName, '%s.csv' % wordList))

    # Save Graph
    if askYesNo('Would you like to save a graph of the data? (y/n)'):
        plt.ticklabel_format(style='plain')
        plt.legend()
        plt.savefig(os.path.join(dirName, '%s.png' % wordList), dpi=100)


if __name__ == '__main__':
    main()