# scraper.py

#!/usr/bin/python3
import sys
import time
import json
import requests

from datetime import datetime
from datetime import timedelta

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from bs4.element import Comment
from bs4 import BeautifulSoup

# Page loaded by scrape(); its `date` query parameter is set to 2022-11-20.
url = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/scores?date=2022-11-20'
# NOTE(review): `year` is not referenced in the visible part of the file —
# confirm whether anything still relies on it.
year = '2022'

# Exactly two user-supplied arguments are required; sys.argv[0] is the
# script path, hence the comparison against 3.
if len(sys.argv) != 3:
    print("You need to provide 2 arguments: filename and time between scrapes")
    # sys.exit() is always available, whereas the bare exit() helper is
    # injected by the `site` module for interactive sessions only.
    sys.exit(1)

def day_at(when):
    """Return the current date, optionally shifted one day, as "%a, %b %d" text.

    Args:
        when: "Yesterday" moves the date back one day and "Tomorrow" moves
            it forward one day; every other value keeps today's date.

    Returns:
        The resulting date formatted with ``strftime("%a, %b %d")``.
    """
    if when == "Yesterday":
        shift_days = -1
    elif when == "Tomorrow":
        shift_days = 1
    else:
        shift_days = 0

    target = datetime.now() + timedelta(days=shift_days)
    return target.strftime("%a, %b %d")

def write_as_json(filename, data, indent=None):
    """Emit ``data`` as JSON, wrapped together with an update timestamp.

    The emitted object has two keys: ``"updated"`` (the current time as a
    whole-second Unix timestamp) and ``"days"`` (``data`` itself). Values
    the ``json`` module cannot encode natively are serialized through
    their ``__dict__``.

    Args:
        filename: Path of the file to (over)write. The empty string means
            "print the JSON to stdout instead of writing a file".
        data: Value stored under the ``"days"`` key.
        indent: Forwarded unchanged to ``json.dump`` / ``json.dumps``.

    Returns:
        None.
    """
    def attributes_of(obj):
        # Fallback encoder: represent arbitrary objects by their attributes.
        return obj.__dict__

    payload = {"updated": int(datetime.now().timestamp()), "days": data}

    if filename == "":
        print(json.dumps(payload, default=attributes_of, indent=indent))
        return

    # The context manager closes the handle even when serialization raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, "w") as out:
        json.dump(payload, out, default=attributes_of, indent=indent)


class Day:
    """Pairs a day label with the matches collected for that day."""

    def __init__(self, day, matches):
        """Store ``day`` and ``matches`` unchanged on the instance."""
        # `day` is bound before `matches`, which keeps the key order of the
        # instance __dict__ (and of any JSON derived from it) stable.
        self.day, self.matches = day, matches

class Match:
    """A single match parsed from the child nodes of one score chip.

    Instance attributes are assigned in the order ``group``, ``team1``,
    ``team2``, ``score1``, ``score2``, ``time``; that is also the key order
    of the instance ``__dict__``.
    """

    def __init__(self, matches):
        """Extract group, teams, scores and status text from parsed nodes.

        Args:
            matches: Indexable sequence of parsed nodes (the caller passes
                the ``contents`` of a ``.score-chip-content`` element).
                Item 0 supplies the group label, item 1 the two team rows
                and the time text, item 2 the state text.

        Raises:
            ValueError: if a score element's text is not an integer.
        """
        rows = matches[1].div.contents
        self.group = matches[0].text.lower().title()
        self.team1 = self._team_name(rows[0])
        self.team2 = self._team_name(rows[1])
        self.score1 = self._score(rows[0])
        self.score2 = self._score(rows[1])

        # Named `clock` rather than `time` so the imported `time` module is
        # not shadowed inside this method.
        clock = matches[1].contents[1].text
        state = matches[2].span.text
        if clock != "":
            self.time = clock
        elif state == "FINAL":
            # A finished match is reported as "Ended" instead of "FINAL".
            self.time = "Ended"
        else:
            self.time = state

    @staticmethod
    def _team_name(row):
        """Return the title-cased team name found in one team row."""
        return row.contents[1].span.text.lower().title()

    @staticmethod
    def _score(row):
        """Return the row's score as an int, or 0 if the slot is a comment node."""
        slot = row.contents[3]
        # Presumably the page leaves a comment placeholder where the score
        # would be until one exists — TODO confirm against the live markup.
        if isinstance(slot, Comment):
            return 0
        return int(slot.span.text)

    def print(self):
        """Write the group, both "team score" lines and the time to stdout."""
        print(self.group)
        print(self.team1, self.score1)
        print(self.team2, self.score2)
        print(self.time)

# Create one headless Chrome WebDriver at import time; scrape() reuses this
# module-level `driver` for every page load.
# NOTE(review): `options.headless` and the `executable_path` argument appear
# to be deprecated in newer Selenium 4 releases — confirm against the
# installed Selenium version before upgrading.
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options, executable_path='chromedriver')

def scrape(filename=""):
    """Load the scores page, parse each day section, and emit the result as JSON.

    Args:
        filename: Forwarded to ``write_as_json``; with the default ``""``
            the JSON is printed to stdout rather than written to a file.

    Returns:
        None.
    """
    driver.get(url)
    # Fixed pause before reading the page source; presumably gives the
    # page's client-side rendering time to finish — TODO confirm.
    time.sleep(6)
    soup = BeautifulSoup(driver.page_source, "lxml")

    days = []
    for section in soup.find_all('div', class_="score-data score-section header-pinned"):
        matches = [Match(chip.contents) for chip in section.select(".score-chip-content")]

        if isinstance(section.div.string, Comment):
            # The header holds only a comment node, so this section carries
            # no date of its own; fall back to the fixed label.
            date = "Sun, Nov 20"
        else:
            # Header text arrives in all caps; normalize to title case.
            date = section.div.div.text.lower().title()

        # Fewer than three words means a relative label such as
        # Today/Tomorrow/Yesterday; convert it to a concrete date.
        if len(date.split()) < 3:
            date = day_at(date)

        days.append(Day(date, matches))

    write_as_json(filename, days, 2)

# Parse the interval once, up front, so a non-numeric value fails before
# any scraping starts instead of after the first scrape.
scrape_interval = int(sys.argv[2])

try:
    # Run until interrupted: refresh the output, then wait before the next pass.
    while True:
        scrape(sys.argv[1])
        time.sleep(scrape_interval)
finally:
    # Reached on Ctrl+C or any error: shut the browser session down so no
    # Chrome/chromedriver process is left running after the script exits.
    driver.quit()