GabrielPastorello · a4v2d4 · Oct 26, 2024 · Oct 26, 2024 · Oct 26, 2024 · Oct 26, 2024
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.venv/
diff --git a/BRScraper/nba.py b/BRScraper/nba.py
@@ -1,6 +1,8 @@
 import pandas as pd
 import warnings
 import re
+import requests
+from bs4 import BeautifulSoup
 from datetime import date
 
 dict_teams = {'Utah Jazz':'UTA','Phoenix Suns':'PHO',
@@ -61,28 +63,90 @@ def get_stats(season, info='per_game', playoffs=False, rename=False):
     else:
         comp = 'leagues'
 
-    url_stats = ['https://www.basketball-reference.com/'+comp+'/NBA_'+str(season)+'_per_game.html', # pergame
-                'https://www.basketball-reference.com/'+comp+'/NBA_'+str(season)+'_totals.html', # total
-                'https://www.basketball-reference.com/'+comp+'/NBA_'+str(season)+'_advanced.html', # advanced
-                'https://www.basketball-reference.com/'+comp+'/NBA_'+str(season)+'_per_minute.html', # per 36 min
-                'https://www.basketball-reference.com/'+comp+'/NBA_'+str(season)+'_per_poss.html', # per 100 poss
-                ] 
+    # Construct URLs for different statistic types
+    url_stats = {
+        'per_game': f'https://www.basketball-reference.com/{comp}/NBA_{season}_per_game.html',
+        'totals': f'https://www.basketball-reference.com/{comp}/NBA_{season}_totals.html',
+        'advanced': f'https://www.basketball-reference.com/{comp}/NBA_{season}_advanced.html',
+        'per_36': f'https://www.basketball-reference.com/{comp}/NBA_{season}_per_minute.html',
+        'per_100': f'https://www.basketball-reference.com/{comp}/NBA_{season}_per_poss.html',
+    }
+
     try:
-        if info=='per_game':
-            df = pd.read_html(url_stats[0])[0]
-        elif info=='totals':
-            df = pd.read_html(url_stats[1])[0]
-        elif info=='advanced':
-            df = pd.read_html(url_stats[2])[0]
-            df = df.drop(['Unnamed: 24','Unnamed: 19'], axis=1).reset_index(drop=True)
-        elif info=='per_36':
-            df = pd.read_html(url_stats[3])[0]
-        elif info=='per_100':
-            df = pd.read_html(url_stats[4])[0]
-    except:
-        raise ValueError(str(season)+' is not a valid season.')
-
-    df = df[(df['Player'].notna())&(df['Player']!='Player')].drop(['Rk'], axis=1).reset_index(drop=True)
+        # Select the appropriate URL based on the 'info' parameter
+        url = url_stats[info]
+
+        # Fetch the HTML content of the page
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an error for bad status codes
+
+        # Parse the HTML content using BeautifulSoup
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Locate the table containing the statistics
+        table = soup.find('table')
+        if table is None:
+            raise ValueError(f"No table found on the page for season {season} and info '{info}'.")
+
+        tbody = table.find('tbody')
+        if tbody is None:
+            raise ValueError(f"No table body found on the page for season {season} and info '{info}'.")
+
+        # Extract all table rows
+        rows = tbody.find_all('tr')
+
+        # Initialize a list to store player_ids
+        player_ids = []
+
+        # Iterate over each row to extract the 'player_id'
+
+        for row in rows:
+            # Find the <td> element with 'data-stat' attribute 'name_display'
+            player_td = row.find('td', {'data-stat': 'name_display'})            
+            if not player_td:
+                player_td = row.find('td', {'data-stat': 'player'})
+
+            if player_td:
+                # Extract the player's name text
+                player_name = player_td.get_text(strip=True)
+
+                player_id = player_td.get('data-append-csv', None)
+                # print(player_id)
+                if player_id:
+                    player_ids.append(player_id)
+            # else:
+            #     print(row)
+
+        # Use pandas to read the table into a DataFrame
+        from io import StringIO
+        df = pd.read_html(StringIO(response.text))[0]
+
+        # Filter the DataFrame to exclude rows where 'Player' is NaN, 'Player', or 'League Average'
+        df = df[(df['Player'].notna()) &
+                (df['Player'] != 'Player') &
+                (df['Player'] != 'League Average')].reset_index(drop=True)
+
+        # Remove the original 'Rk' column
+        if 'Rk' in df.columns:
+            df = df.drop(['Rk'], axis=1)
+
+        df = df.drop(['Player'], axis=1)
+        df = df.drop(['Age'], axis=1)
+        df = df.drop(['FG%'], axis=1)
+        df = df.drop(['2P%'], axis=1)
+        df = df.drop(['3P%'], axis=1)
+        df = df.drop(['eFG%'], axis=1)
+        df = df.drop(['FT%'], axis=1)
+
+        # The 'Player' name column was dropped above; the scraped IDs replace it.
+        # Insert 'player_id' as the first column (len(player_ids) must equal the number of rows in df)
+        # print(player_ids)
+        df.insert(0, 'player_id', player_ids)
+
+    except requests.HTTPError as http_err:
+        raise ValueError(f"HTTP error occurred: {http_err}")
+    except Exception as e:
+        raise ValueError(f"An error occurred while fetching data for season {season}: {e}")
 
     if rename:
         cols = ['Player','Pos','Age','Tm','G','GS']
@@ -337,7 +401,40 @@ def get_player_stats(name):
                         Valid names would be "LeBron James", "lebron james" or "LEBRON JAMES" for example.''')
 
     try:
-        df = pd.read_html(url)[0]
+        # Fetch the page content
+        response = requests.get(url)
+        if response.status_code != 200:
+            raise ValueError(
+                name
+                + " is not a valid name. Check for misspelling errors or if that player exists."
+            )
+        html = response.text
+
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(html, "html.parser")
+
+        # Find the span with id='totals_link' and data-label='Totals'
+        totals_span = soup.find(
+            "span", attrs={"id": "totals_link", "data-label": "Totals"}
+        )
+
+        if totals_span is None:
+            raise ValueError("Totals data-label not found for " + name)
+
+        # Now, find the parent div with id='div_totals'
+        parent_div = soup.find("div", id="div_totals")
+
+        if parent_div is None:
+            raise ValueError("Totals table container not found for " + name)
+
+        # Now, find the table within this div
+        table = parent_div.find("table")
+
+        if table is None:
+            raise ValueError("Totals table not found for " + name)
+
+        # Read the table into pandas DataFrame
+        df = pd.read_html(str(table))[0]
     except:
         raise ValueError(name+' is not a valid name. Check for mispelling errors or if that players exists.')
 
@@ -405,7 +502,7 @@ def get_birthdays():
     month = today.month
     day = today.day
 
-    url = 'https://www.basketball-reference.com/friv/birthdays.fcgi?month='+str(day)+'&day='+str(month)
+    url = 'https://www.basketball-reference.com/friv/birthdays.fcgi?month='+str(month)+'&day='+str(day)
 
     try:
         df = pd.read_html(url)[0]

diff --git a/__pycache__/nba_teams.cpython-312.pyc b/__pycache__/nba_teams.cpython-312.pyc
diff --git a/__pycache__/test_nba_teams.cpython-312.pyc b/__pycache__/test_nba_teams.cpython-312.pyc