1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
#!/usr/bin/python3
import sys
import time
import json
import requests
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4.element import Comment
from bs4 import BeautifulSoup
# Scoreboard page that gets scraped; the `date` query parameter is fixed to 2022-11-20.
url = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/scores?date=2022-11-20'
# NOTE(review): `year` is not referenced by any code visible in this file.
year = '2022'

# sys.argv holds the script name plus the two required arguments:
# the output filename and the number of seconds to wait between scrapes.
if len(sys.argv) != 3:
    print("You need to provide 2 arguments: filename and time between scrapes")
    # Use sys.exit(): the bare exit() helper is injected by the `site` module
    # for interactive sessions and is not guaranteed to exist in every run mode.
    sys.exit(1)
def day_at(when):
    """Return the current local date, optionally shifted, as e.g. "Sun, Nov 20".

    ``when`` equal to "Yesterday" moves the date back one day, "Tomorrow"
    moves it forward one day, and any other value leaves it at today.
    The result is formatted with ``"%a, %b %d"``.
    """
    if when == "Yesterday":
        shift = -1
    elif when == "Tomorrow":
        shift = 1
    else:
        shift = 0
    target = datetime.now() + timedelta(days=shift)
    return target.strftime("%a, %b %d")
def write_as_json(filename, data, indent=None):
    """Wrap ``data`` with a timestamp and emit the result as JSON.

    The emitted document is ``{"updated": <int Unix timestamp of now>,
    "days": data}``. Any object the json module cannot encode natively is
    serialised through its ``__dict__``.

    Args:
        filename: Path of the file to (over)write. An empty string prints
            the JSON document to stdout instead of writing a file.
        data: Value stored under the ``"days"`` key.
        indent: Passed straight through to ``json.dump`` / ``json.dumps``.
    """
    payload = {"updated": int(datetime.now().timestamp()), "days": data}

    def as_dict(obj):
        # Fallback encoder: dump plain objects by their attribute dictionary.
        return obj.__dict__

    if filename == "":  # no target file: print the document to stdout
        print(json.dumps(payload, default=as_dict, indent=indent))
        return

    # The context manager closes the file even if serialisation raises.
    with open(filename, "w") as f:
        json.dump(payload, f, default=as_dict, indent=indent)
class Day:
    """A date label together with the matches listed under it."""

    def __init__(self, day, matches):
        """Store ``day`` and ``matches`` unchanged as same-named attributes."""
        # Assigned in this order so the attribute dictionary lists "day"
        # first and "matches" second.
        self.day, self.matches = day, matches
class Match:
    """One fixture read out of a sequence of parsed page nodes.

    Attributes are assigned in this order: group, team1, team2, score1,
    score2, time.
    """

    def __init__(self, matches):
        """Extract group, team names, scores and time/status from ``matches``.

        ``matches`` is indexed at positions 0, 1 and 2.
        NOTE(review): presumably this is the ``contents`` list of a
        ``.score-chip-content`` node -- confirm against the caller.
        """
        rows = matches[1].div.contents
        # lower().title() normalises the text to Title Case.
        self.group = matches[0].text.lower().title()

        first_cells = rows[0].contents
        self.team1 = first_cells[1].span.text.lower().title()
        second_cells = rows[1].contents
        self.team2 = second_cells[1].span.text.lower().title()

        first_score = first_cells[3]
        second_score = second_cells[3]
        # When the score slot holds an HTML comment node instead of an
        # element, the score is recorded as 0.
        if type(first_score) is Comment:
            self.score1 = 0
        else:
            self.score1 = int(first_score.span.text)
        if type(second_score) is Comment:
            self.score2 = 0
        else:
            self.score2 = int(second_score.span.text)

        clock = matches[1].contents[1].text
        status = matches[2].span.text
        # Prefer the clock text; otherwise fall back to the status text,
        # with "FINAL" reported as "Ended".
        if clock != "":
            self.time = clock
        elif status != "FINAL":
            self.time = status
        else:
            self.time = "Ended"

    def print(self):
        """Write the group, each team with its score, and the time to stdout."""
        print(self.group)
        for side, goals in ((self.team1, self.score1), (self.team2, self.score2)):
            print(f"{side} {goals}")
        print(self.time)
# One headless Chrome session, created at import time and used by scrape().
options = Options()
# Pass the flag directly: the `Options.headless` property was deprecated and
# then removed in recent Selenium 4 releases, where assigning it has no effect.
options.add_argument("--headless")
# `executable_path` is no longer accepted by webdriver.Chrome in recent
# Selenium 4 releases; 'chromedriver' was that parameter's default value in
# the releases that had it, so omitting it keeps the same driver lookup there.
driver = webdriver.Chrome(options=options)
def scrape(filename=""):
    """Load the scoreboard page, parse every day section and emit the JSON.

    The page at ``url`` is loaded in the shared ``driver``, parsed with
    BeautifulSoup, and each matching day section becomes a ``Day`` holding
    its ``Match`` objects. The list of days is passed to ``write_as_json``
    with an indent of 2.

    Args:
        filename: Forwarded to ``write_as_json``; defaults to "".
    """
    driver.get(url)
    # Fixed pause before reading the page source.
    # NOTE(review): presumably to let the page finish rendering -- confirm.
    time.sleep(6)
    page = BeautifulSoup(driver.page_source, "lxml")

    days = []
    for section in page.find_all('div', class_="score-data score-section header-pinned"):
        games = [Match(chip.contents) for chip in section.select(".score-chip-content")]

        if type(section.div.string) is Comment:
            # This section carries no date in its header; use the fixed label.
            label = "Sun, Nov 20"
        else:
            # Header text is title-cased here (the original notes it is all caps).
            label = section.div.div.text.lower().title()
            # Fewer than three words means a relative label rather than a
            # full date, so resolve it to a concrete date.
            if len(label.split()) < 3:
                label = day_at(label)

        days.append(Day(label, games))

    write_as_json(filename, days, 2)
# Scrape forever: write to the file named by sys.argv[1], then wait
# sys.argv[2] seconds before the next run.
try:
    # Parse the interval once, so a non-numeric value fails before the first
    # scrape instead of after it.
    interval = int(sys.argv[2])
    while True:
        scrape(sys.argv[1])
        time.sleep(interval)
finally:
    # Always end the browser session when the loop stops (Ctrl-C, a scrape
    # error, a bad interval) so the Chrome instance is not left running.
    driver.quit()
|