diff options
author | kartofen <mladenovnasko0@gmail.com> | 2022-11-27 18:43:13 +0200 |
---|---|---|
committer | kartofen <mladenovnasko0@gmail.com> | 2022-11-27 18:43:13 +0200 |
commit | 4e96a8a67e1e9a4f85038f09ea77ec3645f85e73 (patch) | |
tree | d7b5e762910fa48e4026d6e02af94c5218d669a2 |
Big Bang
-rwxr-xr-x | scraper.py | 88 |
1 files changed, 88 insertions, 0 deletions
#!/usr/bin/python3
"""Scrape World Cup scores from foxsports.com and print them as JSON.

The page is rendered in headless Chrome through Selenium, parsed with
BeautifulSoup, and every pinned score section is turned into a ``Day``
holding a list of ``Match`` objects.
"""
import time
import json
import requests  # NOTE(review): not referenced anywhere in this file
from datetime import datetime
from datetime import timedelta

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from bs4.element import Comment
from bs4 import BeautifulSoup

url = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/scores?date=2022-11-20'
year = '2022'  # NOTE(review): not referenced anywhere in this file


def day_at(when):
    """Translate a relative day label into a date such as "Sun, Nov 20".

    Args:
        when: "Yesterday" or "Tomorrow" shift the local current date by one
            day; any other value (e.g. "Today") resolves to the current date.

    Returns:
        The resulting date formatted with ``"%a, %b %d"``.
    """
    day_offsets = {"Yesterday": -1, "Tomorrow": 1}
    date = datetime.now() + timedelta(days=day_offsets.get(when, 0))
    return date.strftime("%a, %b %d")


def write_as_json(filename, data, indent=None):
    """Serialize ``data`` together with an "updated" timestamp as JSON.

    Args:
        filename: Destination path. When empty, the JSON document is printed
            to standard output instead of being written to a file.
        data: Value stored under the "days" key. Objects that ``json`` cannot
            encode natively are serialized through their ``__dict__``.
        indent: Passed straight to ``json.dump`` / ``json.dumps``.
    """
    payload = {"updated": int(datetime.now().timestamp()), "days": data}

    def as_dict(obj):
        # Day / Match are plain attribute bags, so their __dict__ is the JSON.
        return obj.__dict__

    if not filename:  # no destination given: print instead of writing
        print(json.dumps(payload, default=as_dict, indent=indent))
        return

    # The context manager closes the file even if serialization raises.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(payload, f, default=as_dict, indent=indent)


class Day:
    """A date label and the matches listed under it."""

    def __init__(self, day, matches):
        self.day = day
        self.matches = matches


class Match:
    """One match parsed from the child nodes of a ``.score-chip-content`` element."""

    def __init__(self, matches):
        """Extract group, teams, scores and time/state from the chip's children.

        Args:
            matches: The ``.contents`` list of a single score chip. Index 0
                supplies the group text, index 1 the two team rows plus the
                kick-off time, and index 2 the match state.
        """
        rows = matches[1].div.contents
        self.group = matches[0].text.lower().title()
        self.team1 = rows[0].contents[1].span.text.lower().title()
        self.team2 = rows[1].contents[1].span.text.lower().title()
        self.score1 = self._score(rows[0].contents[3])
        self.score2 = self._score(rows[1].contents[3])

        kickoff = matches[1].contents[1].text
        state = matches[2].span.text
        # Prefer the kick-off time; otherwise show the state, with the
        # "FINAL" state reported as "Ended".
        if kickoff != "":
            self.time = kickoff
        elif state == "FINAL":
            self.time = "Ended"
        else:
            self.time = state

    @staticmethod
    def _score(node):
        """Return the integer score in ``node``, or 0 when the slot is an HTML comment."""
        if isinstance(node, Comment):
            return 0
        return int(node.span.text)

    def print(self):
        """Print the group, both teams with their scores, and the time."""
        print(self.group)
        print(self.team1, str(self.score1))
        print(self.team2, str(self.score2))
        print(self.time)


def _make_driver():
    """Create the headless Chrome driver used to render the scores page."""
    options = Options()
    # NOTE(review): `options.headless` and `executable_path` are deprecated in
    # Selenium 4 -- confirm against the installed version before upgrading.
    options.headless = True
    return webdriver.Chrome(options=options, executable_path='chromedriver')


def scrape():
    """Render ``url``, parse every score section and print the result as JSON."""
    driver = _make_driver()
    try:
        driver.get(url)
        time.sleep(5)  # presumably lets client-side rendering finish -- TODO confirm
        page_source = driver.page_source
    finally:
        # Always shut the browser down so no Chrome/chromedriver process leaks.
        driver.quit()

    soup = BeautifulSoup(page_source, "lxml")

    days = []
    for day in soup.find_all('div', class_="score-data score-section header-pinned"):
        matches = [Match(chip.contents) for chip in day.select(".score-chip-content")]

        date = "Sun, Nov 20"  # fallback when the section header carries no date
        if not isinstance(day.div.string, Comment):
            date = day.div.div.text.lower().title()  # the header text is all caps

        if len(date.split()) < 3:  # relative label: today, tomorrow or yesterday
            date = day_at(date)

        days.append(Day(date, matches))

    write_as_json("", days, 2)


if __name__ == "__main__":
    scrape()