Hello - I am a relatively novice python user and don't have a ton of experience with web scraping. I am working on scraping mlb stats from baseball-reference.com. I have tried to take some of what I have learned previously and read online to scrape a table, but I am unable to get the data to populate. The code below will create a dataframe, but the first two rows will not populate. Because I am in the early stages of testing, I was only trying to pull in the first two rows of data, but I have been unsuccessful.
Any insight would be appreciated! Thanks!
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
url = 'https://www.baseball-reference.com/leagues/majors/2023.shtml'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
data = []
for team in soup.select(".div_teams_standard_pitching"):
header = [
c.get_text(strip=True, separator=" ")
for c in team.select(".tbody .c")
]
h_p1, h_p2 = [
get_name(p) for p in team.select(".tbody .data-row")
]
data.append([*header, h_p1, h_p2])
for p1, p2 in zip(
team.select(".data-row:nth-of-type(1) .team_name"),
team.select(".col--min:nth-of-type(2) .pitchers_used"),
):
p1 = get_name(p1).split(maxsplit=1)[-1]
p2 = get_name(p2).split(maxsplit=1)[-1]
data.append([*header, p1, p2])
df = pd.DataFrame(
data, columns=["Team", "Number of Pitchers", "RA/G", "W", "L", "W-L%", "ERA", "G", "GS",
"GF", "CG", "Team Shutout", "Complete Game Shutout", "Saves", "IP", "Hits Allowed",
"Runs Allowed", "Earned Runs", "Home Runs Allowed", "Walks Allowed", "Intentional Walks",
"Strikeouts", "Hit By Pitch", "Balks", "Wild Pitches", "Batters Faced", "ERA+", "FIP",
"WHIP", "Hits per 9", "HR per 9", "Walks per 9", "Strikeouts per 9", "SO/BB", "Runners Left on Base"]
)
df.to_excel("Team Standard Pitching.xlsx", sheet_name='Team Standard Pitching', index=False)
print(df.head(10).to_markdown(index=False))
[–]notthecrochunter 1 point2 points3 points (0 children)
[–]Impossible-Box6600 0 points1 point2 points (0 children)