ngrams word popularity
This commit is contained in:
parent
317531b098
commit
4ac823cbe2
4 changed files with 73 additions and 0 deletions
3
python/data science/poop, klutzy.csv
Normal file
3
python/data science/poop, klutzy.csv
Normal file
|
@ -0,0 +1,3 @@
|
|||
,word,stdev,mean,median,mode,range,q1,q3,iqr
|
||||
0,poop,2.599393484092581e-07,6.139207233218996e-07,6.637646704023479e-07,9.126888329547e-07,1.069131408420227e-06,4.1074716961020385e-07,8.147061285918815e-07,4.039589589816777e-07
|
||||
1,klutzy,7.003951820605788e-09,3.2869637653755984e-09,0.0,0.0,2.649079254370333e-08,0.0,1.0566731666248965e-10,1.0566731666248965e-10
|
|
BIN
python/data science/poop, klutzy.png
Normal file
BIN
python/data science/poop, klutzy.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 32 KiB |
69
python/data science/wordPopularity.py
Normal file
69
python/data science/wordPopularity.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
import statistics
|
||||
import requests
|
||||
import json
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy
|
||||
import os
|
||||
import pandas
|
||||
|
||||
# Show floats in DataFrame output as fixed-point with 10 decimal places.
pandas.options.display.float_format = '{:.10f}'.format

words = input('Please enter a list of words. Separate each word with a comma (:\n')
print()

# Inclusive range of years requested from the ngrams endpoint; `years` is
# reused later as the x-axis of the plot.
startYear = 1800
endYear = 2019
years = range(startYear, endYear + 1)

# Let requests build the query string so that characters such as spaces,
# '&', '#' or '+' in the user's words are percent-encoded instead of
# corrupting the URL.
response = requests.get(
    'https://books.google.com/ngrams/json',
    params={
        'content': words,
        'year_start': startYear,
        'year_end': endYear,
        'corpus': 26,
        'smoothing': 3,
    },
    timeout=30,  # do not hang forever if the service is unreachable
)
# Surface HTTP failures directly rather than as a confusing JSON decode
# error on an HTML error page.
response.raise_for_status()
data = json.loads(response.content)
|
||||
|
||||
# Build one row of summary statistics per entry in `data` and add each
# entry's time series to the current matplotlib figure.
frames = []

for entry in data:
    series = entry['timeseries']
    # Key order is kept as-is because it determines the DataFrame column order.
    row = {
        'word': entry['ngram'],
        'stdev': numpy.std(series),
        'mean': numpy.mean(series),
        'median': numpy.median(series),
        'mode': statistics.mode(series),
        'range': max(series) - min(series),
        'q1': numpy.percentile(series, 25),
        'q3': numpy.percentile(series, 75),
    }
    # Interquartile range is derived from the two quartiles computed above.
    row['iqr'] = row['q3'] - row['q1']
    frames.append(row)
    plt.plot(years, series, label=row['word'])

# Tabulate the collected rows and show them to the user.
df = pandas.DataFrame(frames)
print(df)
|
||||
|
||||
# Summary Statistics
# Find the extreme standard deviations once, instead of rescanning every
# row on each loop iteration. `default=None` keeps an empty `frames` list
# from raising; in that case the loop body never runs.
stdevs = [f['stdev'] for f in frames]
highestStdev = max(stdevs, default=None)
lowestStdev = min(stdevs, default=None)

for frame in frames:
    print()
    # A row that ties for both extremes is reported as the highest only.
    if frame['stdev'] == highestStdev:
        print('%s has the highest standard deviation!' % (frame['word']))
    elif frame['stdev'] == lowestStdev:
        print('%s has the lowest standard deviation!' % (frame['word']))
|
||||
|
||||
# Save CSV
# Output files are written next to this script and named after the
# returned ngrams joined with ', ' (e.g. "poop, klutzy.csv").
dirName = os.path.dirname(os.path.realpath(__file__))
wordList = ', '.join([f['word'] for f in frames])

# Keep asking until the answer is y or n. The answer is lowercased so that
# 'Y' / 'N' are accepted, matching the graph prompt further down.
while True:
    toSave = input('Would you like to save this data frame in a CSV? (y/n)').lower()
    if toSave == 'y':
        df.to_csv(os.path.join(dirName, '%s.csv' % wordList))
        break
    if toSave == 'n':
        break
|
||||
|
||||
# Save Graph
# Keep asking until the answer is y or n (case-insensitive). This is the
# final step of the script, so leaving the loop ends the program; `break`
# is used instead of `exit()`, which is an interactive helper installed by
# the `site` module and is not guaranteed to exist in every interpreter.
while True:
    toSave = input('Would you like to save a graph of the data? (y/n)').lower()
    if toSave == 'y':
        # Plain tick labels instead of scientific notation on the axes.
        plt.ticklabel_format(style='plain')
        plt.legend()
        plt.savefig(os.path.join(dirName, '%s.png' % wordList), dpi=100)
        break
    if toSave == 'n':
        break
|
||||
|
|
@ -12,6 +12,7 @@
|
|||
- [Data Science](python/data%20science)
|
||||
- [Fake Jobs Scraper](python/data%20science/fakejobs.py)
|
||||
- [Country Population vs Active Facebook Users in the Country](python/data%20science/fbPercentActive.py)
|
||||
- [ngrams Word Popularity](python/data%20science/wordPopularity.py)
|
||||
- [Calculators](python/calculators)
|
||||
- [Binomial Distribution](python/calculators/Binomial%20Distribution.py)
|
||||
- [Pearson's Product-Moment Correlation Coefficient](python/calculators/PMCC.py)
|
||||
|
|
Loading…
Reference in a new issue