65 lines
1.9 KiB
Python
65 lines
1.9 KiB
Python
import requests
|
|
import pandas as pd
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
import os
|
|
|
|
# Price of one subscription at each tier; multiplied by the per-tier
# subscriber counts to obtain the total cost of a channel's subscriptions.
# NOTE(review): currency is not stated anywhere in this file - presumably USD.
TIER1PRICE = 4.99
TIER2PRICE = 9.99
TIER3PRICE = 24.99

# Amount counted as channel earnings for one subscription at each tier.
# NOTE(review): presumably the streamer's share of the price - confirm.
TIER1EARNING = 2.50
TIER2EARNING = 5
TIER3EARNING = 15

# Request headers sent with every page fetch: a desktop Chrome User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
|
|
|
def getRows(page, timeout=30):
    """Fetch one page of the TwitchTracker subscribers table and return its rows.

    Args:
        page: Page number substituted into the ``?page=`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the script
            indefinitely.

    Returns:
        The ``<tr>`` elements matched by the ``#channels tr`` selector with
        the first one removed (presumably the table's header row); an empty
        list when the page contains no such rows.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    # Pause before every request so consecutive pages are not fetched
    # back to back.
    time.sleep(1)
    url = 'https://twitchtracker.com/subscribers?page=%i' % (page)
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    trs = soup.select('#channels tr')
    if trs:
        # Drop the first matched row so only the rows after it are returned.
        del trs[0]
    print('Processed page %i' % (page))
    return trs
|
|
|
|
# Ask how many table pages to fetch; each fetched page is parsed into one
# dict per channel row and collected in `data`.
howMany = int(input('How many pages would you like to scrape? There are 20 streamers a page.\n'))
data = []

for pageIndex in range(howMany):
    # Pages are numbered from 1 on the site.
    trs = getRows(pageIndex + 1)
    if not trs:
        continue

    # Remove dividers from the table: every 11th row is treated as a divider.
    # A filtered copy is built instead of deleting from `trs` while iterating
    # over it, which skipped elements and shifted later positions.
    # NOTE(review): assumes dividers sit at positions 11, 22, ... - confirm
    # against the live page layout.
    rows = [tr for count, tr in enumerate(trs, 1) if count % 11 != 0]

    for row in rows:
        tds = row.select('td')

        # A row too short to hold the channel-name cell (index 3) cannot be
        # recorded; skip it instead of failing with an IndexError.
        if len(tds) < 4:
            continue

        frame = {'channelName': tds[3].text.strip()}

        # Filters out rows without full stats: a complete row has exactly 11
        # cells and none of them contains a '?' placeholder.
        fullStats = len(tds) == 11 and not any('?' in td.text for td in tds)

        # Fill in the stats if they are full
        if fullStats:
            tier1 = int(tds[8].text)
            tier2 = int(tds[9].text)
            tier3 = int(tds[10].text)
            prime = int(tds[6].text)

            totalCost = (tier1 * TIER1PRICE) + (tier2 * TIER2PRICE) + (tier3 * TIER3PRICE)
            totalEarnings = (tier1 * TIER1EARNING) + (tier2 * TIER2EARNING) + (tier3 * TIER3EARNING)

            frame['totalSubs'] = int(tds[4].text)
            frame['totalCostOfSubs'] = totalCost
            # The cut is computed from the paid tiers only; prime subs are
            # added to the earnings afterwards at the tier-1 earning rate.
            frame['twitchCuts'] = totalCost - totalEarnings
            frame['totalEarnings'] = totalEarnings + (prime * TIER1EARNING)

        # Rows with incomplete stats are still recorded, with only the name.
        data.append(frame)
|
|
|
|
# Turn the collected per-channel dicts into a table and number the rows
# from 1 instead of 0.
df = pd.DataFrame(data, index=None)
df.index += 1

# Write the CSV next to this script; os.path.join supplies the correct
# path separator instead of a hard-coded '/'.
df.to_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'twitchrevenue.csv'))
|