summary refs log tree commit diff
diff options
context:
space:
mode:
author kartofen <mladenovnasko0@gmail.com> 2022-11-27 18:43:13 +0200
committer kartofen <mladenovnasko0@gmail.com> 2022-11-27 18:43:13 +0200
commit 4e96a8a67e1e9a4f85038f09ea77ec3645f85e73 (patch)
tree d7b5e762910fa48e4026d6e02af94c5218d669a2
Big Bang
-rwxr-xr-x scraper.py 88
1 files changed, 88 insertions, 0 deletions
diff --git a/scraper.py b/scraper.py
new file mode 100755
index 0000000..a7a5c50
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+import time
+import json
+import requests
+from datetime import datetime
+from datetime import timedelta
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from bs4.element import Comment
+from bs4 import BeautifulSoup
+
# Scores page to scrape; the query string pins the listing to 2022-11-20.
url = "https://www.foxsports.com/soccer/2022-fifa-world-cup/scores?date=2022-11-20"
# Tournament year as a string; not referenced anywhere else in this script.
year = "2022"
+
def day_at(when, now=None):
    """Return the date named by a relative day label, e.g. "Sun, Nov 20".

    Args:
        when: "Yesterday" or "Tomorrow" move the reference date back or
            forward by one day; any other value leaves it unchanged.
        now: Reference datetime. Defaults to the current local time; pass
            an explicit value to get a reproducible result.

    Returns:
        The resulting date formatted with "%a, %b %d".
    """
    if now is None:
        now = datetime.now()

    if when == "Yesterday":
        shift = timedelta(days=-1)
    elif when == "Tomorrow":
        shift = timedelta(days=1)
    else:
        shift = timedelta(0)

    return (now + shift).strftime("%a, %b %d")
+
def write_as_json(filename, data, indent = None):
    """Emit the scraped days as JSON, stamped with the time of this update.

    The document has the shape {"updated": <unix seconds as int>, "days": data}.
    Objects the json module cannot encode natively are serialized through
    their ``__dict__``.

    Args:
        filename: Destination path. An empty string prints the JSON to
            stdout instead of writing a file.
        data: Value stored under the "days" key.
        indent: Passed through to json as the indentation setting.
    """
    payload = {"updated": int(datetime.now().timestamp()), "days": data}

    def as_dict(obj):
        # Fallback encoder for plain objects.
        return obj.__dict__

    if filename == "":
        print(json.dumps(payload, default=as_dict, indent=indent))
        return

    # The context manager closes the file even if serialization raises.
    with open(filename, "w") as out:
        json.dump(payload, out, default=as_dict, indent=indent)
+
+
class Day:
    """A day label together with the matches listed under it."""

    def __init__(self, day, matches):
        # Assigned left to right, so "day" stays ahead of "matches" in __dict__.
        self.day, self.matches = day, matches
+
class Match:
    """One match parsed from the child nodes of a score chip.

    Attributes, in assignment order: group, team1, team2, score1, score2, time.
    """

    def __init__(self, matches):
        """Read the match fields out of the chip's child nodes.

        Args:
            matches: Indexable sequence of parsed nodes. Index 0 supplies the
                group label, index 1 the two team rows and the kickoff time,
                index 2 the match state.
        """
        rows = matches[1].div.contents
        self.group = matches[0].text.lower().title()  # site text is upper case
        self.team1 = self._team_name(rows[0])
        self.team2 = self._team_name(rows[1])
        self.score1 = self._score(rows[0])
        self.score2 = self._score(rows[1])

        # Named "kickoff" so the imported time module is not shadowed.
        kickoff = matches[1].contents[1].text
        state = matches[2].span.text
        if kickoff != "":
            self.time = kickoff
        elif state != "FINAL":
            self.time = state
        else:
            self.time = "Ended"

    @staticmethod
    def _team_name(row):
        # Team name from a team row, title-cased.
        return row.contents[1].span.text.lower().title()

    @staticmethod
    def _score(row):
        # Score from a team row; a Comment node in the score slot counts as 0
        # (presumably a match with no score yet; confirm against the page).
        cell = row.contents[3]
        return 0 if isinstance(cell, Comment) else int(cell.span.text)

    def print(self):
        """Print group, both teams with their scores, and the time, one per line."""
        print(self.group)
        print(self.team1, str(self.score1))
        print(self.team2, str(self.score2))
        print(self.time)
+
# Shared headless Chrome session used by scrape().
options = Options()
# The "--headless" argument replaces the deprecated `options.headless` setter,
# which later Selenium 4 releases removed.
options.add_argument("--headless")
# No executable_path: that keyword is deprecated and removed in later
# Selenium 4 releases, and "chromedriver" was its default value anyway.
driver = webdriver.Chrome(options=options)
+
def scrape():
    """Load the scores page, parse every day section, and print the result as JSON."""
    driver.get(url)
    time.sleep(5)  # fixed pause before reading the page source (presumably lets the page finish rendering)
    page = BeautifulSoup(driver.page_source, "lxml")

    days = []
    for section in page.find_all('div', class_="score-data score-section header-pinned"):
        matches = [Match(chip.contents) for chip in section.select(".score-chip-content")]

        if type(section.div.string) is Comment:
            # This section has no date in its header; use the fixed label.
            label = "Sun, Nov 20"
        else:
            # Header text is all caps on the page; normalise to title case.
            label = section.div.div.text.lower().title()

        # Fewer than three words means a relative label (today, tomorrow or
        # yesterday) rather than a full date.
        if len(label.split()) < 3:
            label = day_at(label)

        days.append(Day(label, matches))

    write_as_json("", days, 2)
+
# Always shut the browser down, even when scraping fails, so no Chrome
# process is left running after the script exits.
try:
    scrape()
finally:
    driver.quit()