import requests import pandas as pd from bs4 import BeautifulSoup import time import os TIER1PRICE = 4.99 TIER2PRICE = 9.99 TIER3PRICE = 24.99 TIER1EARNING = 2.50 TIER2EARNING = 5 TIER3EARNING = 15 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} def getRows(page): time.sleep(1) response = requests.get('https://twitchtracker.com/subscribers?page=%i' % (page), headers=headers) soup = BeautifulSoup(response.content, 'html.parser') trs = soup.select('#channels tr') if len(trs) > 0: del trs[0] print('Processed page %i' % (page)) return trs howMany = int(input('How many pages would you like to scrape? There are 20 streamers a page.\n')) data = [] for i in range(howMany): trs = getRows(i + 1) if len(trs) > 0: # Remove dividers from the table for count, tr in enumerate(trs, 1): if count % 11 == 0: i = trs.index(tr) del trs[i] for row in trs: frame = {} tds = row.select('td') # Filters out rows without full stas fullStats = True if len(tds) != 11: fullStats = False for td in tds: if td.text.strip().__contains__('?'): fullStats = False frame['channelName'] = tds[3].text.strip() # Fill in the stats if they are full if fullStats == True: tier1 = int(tds[8].text) tier2 = int(tds[9].text) tier3 = int(tds[10].text) prime = int(tds[6].text) totalCost = (tier1 * TIER1PRICE) + (tier2 * TIER2PRICE) + (tier3 * TIER3PRICE) totalEarnings = (tier1 * TIER1EARNING) + (tier2 * TIER2EARNING) + (tier3 * TIER3EARNING) frame['totalSubs'] = int(tds[4].text) frame['totalCostOfSubs'] = totalCost frame['twitchCuts'] = totalCost - totalEarnings frame['totalEarnings'] = totalEarnings + (prime * TIER1EARNING) # Append data.append(frame) df = pd.DataFrame(data, index=None) df.index += 1 df.to_csv(os.path.dirname(os.path.realpath(__file__)) + '/twitchsubs.csv')