# Import packages for the project
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns
import pandas as pd
import sys,os
import warnings
warnings.filterwarnings('ignore')

# Code from Pappalardo, L., Cintia, P., Rossi, A. et al
# Load the raw JSON dumps. Events and matches are stored in one file per league.
nations = ['Italy','England','Germany','France','Spain']

# loading the events data
# events[nation] is whatever the league's JSON file decodes to; later code
# indexes events['England'] as a list of event dicts.
events={}
for nation in nations:
    # Explicit encoding: JSON is UTF-8, so do not rely on the platform default
    with open('events/events_%s.json' %nation, encoding='utf-8') as json_data:
        events[nation] = json.load(json_data)

# loading the match data
matches={}
for nation in nations:
    with open('matches/matches_%s.json' %nation, encoding='utf-8') as json_data:
        matches[nation] = json.load(json_data)

# loading the players data
with open('players.json', encoding='utf-8') as json_data:
    players = json.load(json_data)

# loading the competitions data
with open('competitions.json', encoding='utf-8') as json_data:
    competitions = json.load(json_data)

# Data files from Pappalardo, L., Cintia, P., Rossi, A. et al
# Read the csv files (tabular data, loaded from the working directory).
# df_matches and df_players are used by the preprocessing below; df_events and
# df_competitions are not referenced in this part of the file.
df_matches = pd.read_csv("matches.csv")
df_events = pd.read_csv("events.csv")
df_players = pd.read_csv("players.csv")
df_competitions = pd.read_csv("competitions.csv")

# Code from Pappalardo, L., Cintia, P., Rossi, A. et al
# Pitch function
def pitch():
    """
    Plot a soccer pitch on a new figure using a 0-100 x 0-100 coordinate grid.

    Draws the outline, centre line, both penalty areas, both 6-yard boxes,
    the centre circle and the three spots, plus an arrow labelled 'Attack'
    pointing in the +x direction.

    Returns
    -------
    fig, ax
        The matplotlib figure and axes the pitch was drawn on.
    """
    #create figure
    fig,ax=plt.subplots(figsize=(7,5))

    # Every straight marking as an (x-values, y-values) pair, in drawing order
    pitch_lines = [
        #Pitch Outline & Centre Line
        ([0,0],[0,100]),
        ([0,100],[100,100]),
        ([100,100],[100,0]),
        ([100,0],[0,0]),
        ([50,50],[0,100]),
        #Left Penalty Area
        ([16.5,16.5],[80,20]),
        ([0,16.5],[80,80]),
        ([16.5,0],[20,20]),
        #Right Penalty Area
        ([83.5,100],[80,80]),
        ([83.5,83.5],[80,20]),
        ([83.5,100],[20,20]),
        #Left 6-yard Box
        ([0,5.5],[65,65]),
        ([5.5,5.5],[65,35]),
        # Runs back to the goal line (x=0) like the top edge; the previous
        # end point of 0.5 left a small gap
        ([5.5,0],[35,35]),
        #Right 6-yard Box
        ([100,94.5],[65,65]),
        ([94.5,94.5],[65,35]),
        ([94.5,100],[35,35]),
    ]
    for xs, ys in pitch_lines:
        ax.plot(xs, ys, color="black")

    #Prepare Circles (ellipses, because the two axes are not drawn at equal scale)
    centreCircle = Ellipse((50, 50), width=30, height=39, edgecolor="black", facecolor="None", lw=1.8)
    centreSpot = Ellipse((50, 50), width=1, height=1.5, edgecolor="black", facecolor="black", lw=1.8)
    leftPenSpot = Ellipse((11, 50), width=1, height=1.5, edgecolor="black", facecolor="black", lw=1.8)
    rightPenSpot = Ellipse((89, 50), width=1, height=1.5, edgecolor="black", facecolor="black", lw=1.8)

    #Draw Circles
    for shape in (centreCircle, centreSpot, leftPenSpot, rightPenSpot):
        ax.add_patch(shape)

    #limit axis
    ax.set_xlim(0,100)
    ax.set_ylim(0,100)

    # Direction-of-attack indicator in the bottom-left corner
    ax.annotate("", xy=(25, 5), xytext=(5, 5),
                arrowprops=dict(arrowstyle="->", linewidth=2))
    ax.text(7,7,'Attack',fontsize=20)
    return fig,ax

# Code from Pappalardo, L., Cintia, P., Rossi, A. et al
def draw_pitch(pitch, line, orientation, view):
    """
    Draw a soccer pitch on a new matplotlib figure.

    The pitch is laid out on a 104 (length) x 68 (width) grid. All markings
    are defined once in (along-the-length, across-the-width) coordinates and
    mapped to plot (x, y) according to ``orientation``.

    Parameters
    ----------
    pitch : matplotlib color
        Fill colour of the pitch surface.
    line : matplotlib color
        Colour of all pitch markings.
    orientation : str
        A value starting with "h" (case-insensitive) lays the pitch length
        along the x axis; any other value lays it along the y axis.
    view : str
        A value starting with "h" (case-insensitive) shows only the half of
        the pitch from 49 to 105 along its length; any other value shows the
        full pitch.

    Returns
    -------
    fig, ax
        The created figure and axes (previously nothing was returned, so
        callers that ignore the result are unaffected).
    """
    horizontal = orientation.lower().startswith("h")
    half_view = view.lower().startswith("h")

    def to_plot(along, across):
        """Order a (length-axis, width-axis) pair as plot (x, y)."""
        return (along, across) if horizontal else (across, along)

    # Portrait canvas whenever the visible region is taller than it is wide
    figsize = (6.8,10.4) if horizontal == half_view else (10.4,6.8)
    fig,ax = plt.subplots(figsize=figsize)

    along_limits = (49,105) if half_view else (-1,105)
    x_limits, y_limits = to_plot(along_limits, (-1,69))
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)
    ax.axis('off') # this hides the x and y ticks

    # Straight markings as (along, across) polylines, in drawing order
    markings = [
        # side and goal lines
        ([0,104,104,0,0], [0,0,68,68,0]),
        # outer boxes
        ([104,87.5,87.5,104], [13.84,13.84,54.16,54.16]),
        ([0,16.5,16.5,0], [13.84,13.84,54.16,54.16]),
        # goals
        ([104,104.2,104.2,104], [30.34,30.34,37.66,37.66]),
        ([0,-0.2,-0.2,0], [30.34,30.34,37.66,37.66]),
        # 6 yard boxes
        ([104,99.5,99.5,104], [24.84,24.84,43.16,43.16]),
        ([0,4.5,4.5,0], [24.84,24.84,43.16,43.16]),
        # halfway line
        ([52,52], [0,68]),
    ]
    for along, across in markings:
        xs, ys = to_plot(along, across)
        ax.plot(xs, ys, color=line, zorder=5)

    # Penalty spots and kickoff spot
    for along in (93, 11, 52):
        spot_x, spot_y = to_plot(along, 34)
        ax.scatter(spot_x, spot_y, color=line, zorder=5)

    def ring(along, zorder):
        """Unfilled circle of radius 9.15 centred on the pitch's long axis."""
        return plt.Circle(to_plot(along, 34), 9.15, ls='solid', lw=1.5,
                          color=line, fill=False, zorder=zorder, alpha=1)

    def surface(along0, across0, along_len, across_len):
        """Pitch-coloured rectangle given its corner and extents."""
        width, height = to_plot(along_len, across_len)
        return plt.Rectangle(to_plot(along0, across0), width, height,
                             ls='-', color=pitch, zorder=1, alpha=1)

    # NOTE: the arc circles are centred at 93.5 / 10.5 while the penalty
    # spots above sit at 93 / 11; kept exactly as in the original drawing.
    circle1 = ring(93.5, 1)
    circle2 = ring(10.5, 1)
    circle3 = ring(52, 2)

    ## Rectangles in boxes: drawn over circle1/circle2 so that only the arc
    ## outside each penalty box stays visible. Both use the 16.5 box depth
    ## (the horizontal layout used 16 for rec1, unlike every other box).
    rec1 = surface(87.5, 20, 16.5, 30)
    rec2 = surface(0, 20, 16.5, 30)

    ## Pitch rectangle
    rec3 = surface(-1, -1, 106, 70)

    # Insertion order matters for artists sharing zorder=1: background first,
    # then the arc circles, then the box rectangles that mask them.
    for artist in (rec3, circle1, circle2, rec1, rec2, circle3):
        ax.add_artist(artist)

    return fig, ax

# Data files from Pappalardo, L., Cintia, P., Rossi, A. et al
# tags2name = Information about 'Tag' code (its 'Tag' and 'Description'
# columns are used further down to label event tags)
tags2name = pd.read_csv("tags2name.csv")
eventid2name = pd.read_csv("eventid2name.csv")

# Preview of the tag lookup table (notebook display; no effect in a script)
tags2name.head(3)

# eventid2name = Information about 'event' code
eventid2name.head(3)

# df_players = Information about players
df_players.head(3)

# Data preprocessing for EPL team-code conversion (No need to run again)
# Filter only the EPL matches. Copy the selection so the columns added below
# are written to an independent frame instead of a slice of df_matches.
epl = df_matches[df_matches['nation'] == 'England'].copy()

# Step 1: Extract team names from 'label' (text before the first comma: "<Home> - <Away>")
epl[['Home', 'Away']] = epl['label'].str.extract(r'(.+?) - (.+?),')

# Step 2: Extract team scores from 'label' (text after the comma: "<home> - <away>")
epl[['Home_score', 'Away_score']] = epl['label'].str.extract(r', (\d+) - (\d+)').astype(int)

# Step 3: Create a new DataFrame with 'code' and 'team'
# Only matches with a winner (winner != 0) are used: the winner's code is
# paired with the name of whichever side scored more.
team_id = epl.loc[epl['winner'] != 0, ['winner', 'Home', 'Away', 'Home_score', 'Away_score']].copy()
team_id['team'] = np.where(
    team_id['Home_score'] > team_id['Away_score'], team_id['Home'], team_id['Away']
)
# One row per team code; a team that never won a match gets no entry
team_id = team_id[['winner', 'team']].rename(columns={'winner': 'code'}).drop_duplicates(subset='code').reset_index(drop=True)

# Save as .csv file
team_id.to_csv('team_id.csv', index=False)

# Read the team code file
team_id = pd.read_csv("team_id.csv")
team_id

# Data preprocessing
# Need to filter only the shot-events sequence rows
# 1. Find all the shots/goals row that were chances (opportunity)
# 2. Extract the two previous (contribution-assist) rows to combine those 3 rows as 1 shot-events sequence
# 3. Compress the 3 rows of sequence into one row

# List of all EPL match IDs
epl_match_id_list = df_matches[df_matches['nation'] == 'England']['wyId'].unique()
# Specific tag (opportunity)
target_tag_id = 201

# Initiate empty list to store the data
all_shot_sequences = []

# Events of the English league (one flat list covering every match)
events_list = events['England']

# Index event positions by match in a single pass, so the full event list is
# not rescanned once per match.
event_idx_by_match = {}
for i, ev in enumerate(events_list):
    event_idx_by_match.setdefault(ev['matchId'], []).append(i)

# Matches are visited in epl_match_id_list order and events in file order,
# so the output order is the same as with a full scan per match.
for match_id in epl_match_id_list:
    for i in event_idx_by_match.get(match_id, []):
        # Need two preceding events to form a sequence
        if i < 2:
            continue

        ev = events_list[i]

        # Keep only 'Shot' events carrying the 'Opportunity' tag
        if ev['subEventName'] != 'Shot':
            continue
        if not any(tag['id'] == target_tag_id for tag in ev['tags']):
            continue

        # Get the sequence of events (previous 2 + current)
        shot_sequence = events_list[i-2:i+1]

        # The two preceding events must belong to the same match as the shot;
        # otherwise the "sequence" would straddle two different games.
        if any(event['matchId'] != ev['matchId'] for event in shot_sequence):
            continue

        # Every event needs both a start and an end position
        valid_sequence = all(len(event['positions']) >= 2 for event in shot_sequence)

        # Only append the sequence if it is valid
        if valid_sequence:
            all_shot_sequences.extend(shot_sequence)

# Convert into pandas dataframe (3 consecutive rows per shot sequence)
df_test = pd.DataFrame(all_shot_sequences)
df_test.head(3)

# Flip the spatial x & y coordinates
def extract_and_flip_coordinates(df):
    """
    Return the start coordinates of every event, re-expressed for the shooting team.

    ``df`` is read in consecutive groups of 3 rows (event 1, event 2, shot);
    rows left over after the last complete group are ignored. Each row needs a
    ``teamId`` and a ``positions`` list of ``{'x': ..., 'y': ...}`` dicts.

    For event 1 or 2, when its ``teamId`` differs from the shot's ``teamId``,
    the coordinate is computed as
    (own end position + next event's start position) - own start position.
    Otherwise the event's own start position is used unchanged.

    Returns
    -------
    (x_coords, y_coords) : tuple of lists
        One entry per processed row, in row order.
    """
    team_ids = df["teamId"].tolist()
    positions = df["positions"].tolist()

    x_coords, y_coords = [], []

    # Number of rows that belong to complete 3-event sequences
    usable_rows = (len(df) // 3) * 3

    for first in range(0, usable_rows, 3):
        shooting_team = team_ids[first + 2]

        for offset in range(3):
            row = first + offset
            start = positions[row][0]

            # Only the two lead-up events can be flipped, never the shot itself
            if offset < 2 and team_ids[row] != shooting_team:
                end = positions[row][1]
                next_start = positions[row + 1][0]
                x_coords.append(end['x'] + next_start['x'] - start['x'])
                y_coords.append(end['y'] + next_start['y'] - start['y'])
            else:
                x_coords.append(start['x'])
                y_coords.append(start['y'])

    return x_coords, y_coords

# Add the (possibly flipped) start coordinates of every event
df_test['x'], df_test['y'] = extract_and_flip_coordinates(df_test)
df_test.head(3)

# Convert the Tag-Name dataframe into dictionary form
tags2name_dict = dict(zip(tags2name['Tag'], tags2name['Description']))

# 'tags' holds a list of dictionaries with an 'id' key per event
df_test['tagLabels'] = df_test['tags'].apply(lambda tags: ', '.join([tags2name_dict[tag['id']] for tag in tags]))
# Every run of digits in the textual form of the tag list, joined by ', '
df_test['tagsID'] = df_test['tags'].astype(str).str.findall(r'\d+').apply(lambda x: ', '.join(map(str, x)))

# Create a new variable to store which team is involved in the event.
# A keyed lookup always yields exactly one value per row; assigning a column
# taken from a merge result only lines up while the merge neither adds nor
# reorders rows. Unknown codes become NaN, as with a left merge.
team_by_code = team_id.drop_duplicates(subset='code').set_index('code')['team']
df_test['team'] = df_test['teamId'].map(team_by_code)

# Create a new variable to store the actual player name from 'df_players'
player_by_id = df_players.drop_duplicates(subset='wyId').set_index('wyId')['shortName']
df_test['player'] = df_test['playerId'].map(player_by_id)

# Create a new variable to convert the event time into minutes
df_test['eventMin'] = (df_test['eventSec'] / 60).round(2)
df_test.head(3)

# Save as .csv file
df_test.to_csv("df_test.csv", index=False)

# Read the saved csv file (list/dict columns such as 'tags' and 'positions'
# come back as plain text)
df_test = pd.read_csv("df_test.csv")
df_test.head(3)

# Next preprocessing - Pivoting
# Collapse every run of 3 consecutive rows (one shot sequence) into a single
# wide row: each column becomes '<col>_1', '<col>_2', '<col>_3', except
# 'matchId', which is stored once.
shot_sequences = []

# Rows left over after the last complete group of 3 are not pivoted
complete_rows = len(df_test) - len(df_test) % 3

for i in range(0, complete_rows, 3):
    chunk = df_test.iloc[i:i+3]
    new_row = {}

    for col in df_test.columns:
        values = chunk[col]
        if col == 'matchId':
            # All matchIds in the chunk should be the same; keep the first
            new_row[col] = values.iloc[0]
            continue
        new_row.update({f"{col}_{j+1}": values.iloc[j] for j in range(3)})

    shot_sequences.append(new_row)

# One row per shot sequence
shot_sequence_df = pd.DataFrame(shot_sequences)
shot_sequence_df.head(3)

# Function for adding progression & event duration features
def add_progress_time_features(df, n_events=3):
    """
    Add progression and duration features between consecutive events.

    For each consecutive pair of events (i, i+1), with i from 1 to
    ``n_events - 1``, three columns are added, suffixed with the pair
    (e.g. ``12`` for event 1 -> event 2):

    - ``progress_dist_<pair>``: change in x (``x_{i+1} - x_i``)
    - ``progress_ratio_<pair>``: ``progress_dist`` divided by the Euclidean
      distance between the two points, rounded to 2 decimals; 0 when the two
      points coincide
    - ``event_duration_<pair>``: ``eventSec_{i+1} - eventSec_i`` rounded to
      2 decimals, or 0 when either time column is missing

    Parameters
    ----------
    df : pandas.DataFrame
        Wide shot-sequence frame with ``x_<k>`` and ``y_<k>`` columns (and
        optionally ``eventSec_<k>``) for k = 1..n_events.
    n_events : int, default 3
        Number of events per sequence.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place. (The function now works on
        the frame it is given instead of the global ``shot_sequence_df``.)
    """
    for i in range(1, n_events):
        # Event index suffix (e.g., 12 for event 1 to event 2)
        pair = f'{i}{i+1}'

        t_prev = f'eventSec_{i}'
        t_curr = f'eventSec_{i+1}'

        dx = df[f'x_{i+1}'] - df[f'x_{i}']
        dy = df[f'y_{i+1}'] - df[f'y_{i}']

        # 1. Horizontal progress distance
        df[f'progress_dist_{pair}'] = dx

        # 2. Euclidean distance
        euclid_dist = np.sqrt(dx**2 + dy**2)

        # 3. Progress ratio (rounded to 2 decimals). Coincident points give
        #    0/0 = NaN, which is reported as 0.
        ratio = dx / euclid_dist
        df[f'progress_ratio_{pair}'] = ratio.fillna(0).round(2)

        # 4. Time difference (rounded to 2 decimals)
        if t_prev in df.columns and t_curr in df.columns:
            df[f'event_duration_{pair}'] = (df[t_curr] - df[t_prev]).round(2)
        else:
            df[f'event_duration_{pair}'] = 0

    return df

# Flag, for each of the two lead-up events, whether it was made by the same
# team that took the final shot
shot_sequence_df['same_team_1'] = np.where(
    shot_sequence_df['team_1'] == shot_sequence_df['team_3'], 'same', 'different'
)
shot_sequence_df['same_team_2'] = np.where(
    shot_sequence_df['team_2'] == shot_sequence_df['team_3'], 'same', 'different'
)
# 'yes' as soon as either lead-up event belongs to the other team
shot_sequence_df['opponent_involve'] = np.where(
    (shot_sequence_df['same_team_1'] == 'different') | (shot_sequence_df['same_team_2'] == 'different'),
    'yes', 'no'
)


def _shot_accuracy_label(tag_labels):
    """Classify the shot from its tag-label text (case-sensitive match)."""
    if 'Accurate' in tag_labels:
        return 'accurate'
    if 'Not accurate' in tag_labels:
        return 'not accurate'
    return 'unknown'


# Variable to check whether the final shot made is accurate or not
shot_sequence_df['shot_accuracy'] = shot_sequence_df['tagLabels_3'].apply(_shot_accuracy_label)

# Append progression & time features
shot_sequence_df = add_progress_time_features(shot_sequence_df)
shot_sequence_df.head(3)

# Save as .csv file
shot_sequence_df.to_csv("shot_sequence_df.csv", index=False)

# Read the saved csv file
shot_sequence_df = pd.read_csv("shot_sequence_df.csv")
shot_sequence_df.head(3)

# Extract only the spatial coordinates columns
# Column order matters: each (x_k, y_k) pair must stay adjacent, because the
# conversion below pairs the values up two at a time.
spatial_data = shot_sequence_df[['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]

# Function to convert a row (pandas Series) to a sequence of 2D points
def row_to_sequence(row):
    """
    Pair up consecutive values of ``row`` as ``(x, y)`` points.

    ``row`` is expected to hold an even number of values laid out as
    x, y, x, y, ...; it may be a pandas Series (e.g. a DataFrame row) or any
    plain sequence such as a list or tuple.

    Returns a list of ``(x, y)`` tuples, one per event.
    """
    # A Series with string labels must be read positionally through .iloc;
    # plain row[i] on such a Series is a deprecated positional fallback.
    at = row.iloc if hasattr(row, 'iloc') else row
    return [(at[i], at[i+1]) for i in range(0, len(row), 2)]

# Convert the DataFrame rows into sequences of 2D points
# (one list of three (x, y) tuples per shot sequence)
sequences = spatial_data.apply(row_to_sequence, axis=1).tolist()
# Preview the first three sequences (notebook display)
sequences[0:3]

# Recorded notebook output of `sequences[0:3]`:
# [[(np.int64(61), np.int64(94)),
#   (np.int64(89), np.int64(77)),
#   (np.int64(88), np.int64(61))],
#  [(np.int64(77), np.int64(31)),
#   (np.int64(77), np.int64(31)),
#   (np.int64(78), np.int64(38))],
#  [(np.int64(66), np.int64(81)),
#   (np.int64(81), np.int64(43)),
#   (np.int64(87), np.int64(29))]]

# Import package for Frechet distance
from frechetdist import frdist

# Build the symmetric matrix of pairwise Frechet distances between sequences
n = len(sequences)
frechet_dist = np.zeros((n, n))

# Only the upper triangle is computed (Takes about 20 minutes); the diagonal
# stays 0 and the lower triangle is filled by mirroring afterwards.
for i, curve_a in enumerate(sequences):
    for j in range(i + 1, n):
        distance = frdist(curve_a, sequences[j])
        frechet_dist[i, j] = distance

# Mirror: adding the transpose copies each upper-triangle value to [j, i]
frechet_dist += frechet_dist.T

# Display the Frechet-distance matrix
frechet_dist

# Recorded notebook output of `frechet_dist`:
# array([[ 0.        , 65.        , 32.01562119, ..., 63.95310782,
#         56.0357029 , 56.8594759 ],
#        [65.        ,  0.        , 51.19570294, ..., 14.31782106,
#         23.34523506, 27.29468813],
#        [32.01562119, 51.19570294,  0.        , ..., 50.11985634,
#         42.63801121, 43.41658669],
#        ...,
#        [63.95310782, 14.31782106, 50.11985634, ...,  0.        ,
#         11.40175425, 38.62641583],
#        [56.0357029 , 23.34523506, 42.63801121, ..., 11.40175425,
#          0.        , 37.65634077],
#        [56.8594759 , 27.29468813, 43.41658669, ..., 38.62641583,
#         37.65634077,  0.        ]], shape=(5916, 5916))

# Save the distance matrix as csv file
frechet_dist_df = pd.DataFrame(frechet_dist)
frechet_dist_df.to_csv("frechet_distance.csv", index=False)

# Read the saved csv file
# NOTE: this rebinds `frechet_dist` from the NumPy array computed above to a
# pandas DataFrame; `dist_matrix` holds the same values as a plain ndarray and
# is what the Frechet-only clustering below consumes.
frechet_dist = pd.read_csv("frechet_distance.csv")
dist_matrix = frechet_dist.values

# Import package for K-Medoids
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

# Candidate cluster counts and the seeds each one is repeated with
K_range = range(2, 21)
random_states = [207, 430, 437]

# Per-K averages over the seeds
avg_cost = []
avg_sil = []

for k in K_range:
    # (inertia, silhouette) of every seeded run for this K
    runs = []
    for rs in random_states:
        model = KMedoids(n_clusters=k, metric="precomputed", random_state=rs)
        labels = model.fit_predict(dist_matrix)
        runs.append((model.inertia_, silhouette_score(dist_matrix, labels, metric="precomputed")))

    temp_cost, temp_sil = zip(*runs)
    avg_cost.append(np.mean(temp_cost))
    avg_sil.append(np.mean(temp_sil))

# Side-by-side elbow and silhouette plots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (avg_cost, 'Average Elbow Plot (Frechet Distance Only)', 'Cost (Inertia)', {}),
    (avg_sil, 'Average Silhouette Score (Frechet Distance Only)', 'Score', {'color': 'green'}),
]
for panel_ax, (values, title, ylabel, line_style) in zip(axes, panels):
    panel_ax.plot(K_range, values, marker='o', **line_style)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Number of Clusters (K)')
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xticks(list(K_range))

plt.tight_layout()
plt.show()

# Import package for Gower distance
from gower import gower_matrix

# Define the mixed features (categorical + numerical):
# sub-event types of the two lead-up events, opponent involvement and shot
# accuracy flags, plus the progression/duration features of both event pairs
X_mix = shot_sequence_df[['subEventName_1', 'subEventName_2', 'opponent_involve', 'shot_accuracy',
                          'progress_dist_12', 'progress_ratio_12', 'event_duration_12', 
                          'progress_dist_23', 'progress_ratio_23', 'event_duration_23']]

# Compute the pairwise Gower distance matrix over the mixed-type features
gower_dist = gower_matrix(X_mix)
gower_dist

# Recorded notebook output of `gower_dist`:
# array([[0.        , 0.26367524, 0.2548754 , ..., 0.25610757, 0.2951537 ,
#         0.47080293],
#        [0.26367524, 0.        , 0.3389019 , ..., 0.0076088 , 0.25355783,
#         0.41021228],
#        [0.2548754 , 0.3389019 , 0.        , ..., 0.34637523, 0.39244175,
#         0.516001  ],
#        ...,
#        [0.25610757, 0.0076088 , 0.34637523, ..., 0.        , 0.24606653,
#         0.4177036 ],
#        [0.2951537 , 0.25355783, 0.39244175, ..., 0.24606653, 0.        ,
#         0.4182586 ],
#        [0.47080293, 0.41021228, 0.516001  , ..., 0.4177036 , 0.4182586 ,
#         0.        ]], shape=(5916, 5916), dtype=float32)

# Manual function to scale with MinMaxScaler with symmetric-preserving
# MinMax: Scale to a range of (0, 1)
def minmax_symmetrize(D):
    """
    Min-max scale a distance matrix to [0, 1] using one global min and max.

    A single scalar minimum and maximum over ALL entries is used (not one per
    column), so a symmetric input stays symmetric. The function does not
    symmetrize an asymmetric input.

    Parameters
    ----------
    D : numpy.ndarray or pandas.DataFrame
        Matrix of distances.

    Returns
    -------
    Same type as ``D``
        ``(D - min) / (max - min)``. If every entry is equal, the result is
        all zeros rather than a division by zero.
    """
    # Reduce over a plain array so the result is always a scalar, whatever
    # the container type of D
    values = np.asarray(D)
    min_val = values.min()
    max_val = values.max()

    span = max_val - min_val
    if span == 0:
        # Constant matrix: every distance is identical
        return D - min_val
    return (D - min_val) / span

# Scale both distance matrices
# (at this point `frechet_dist` is the DataFrame reloaded from
#  frechet_distance.csv, while `gower_dist` is the array from gower_matrix)
frechet_normalized = minmax_symmetrize(frechet_dist)
gower_normalized = minmax_symmetrize(gower_dist)

# Combine the matrices with equal weights
w_frechet = 0.5
w_gower = 0.5
# np.array(...) turns the weighted sum into a plain ndarray for clustering
D_final = np.array(w_frechet * frechet_normalized + w_gower * gower_normalized)

# Check the combined matrix
D_final

# Recorded notebook output of `D_final`:
# array([[0.        , 0.26955945, 0.21823385, ..., 0.26294744, 0.27876703,
#         0.39998796],
#        [0.26955945, 0.        , 0.30202867, ..., 0.02486983, 0.20542291,
#         0.31795093],
#        [0.21823385, 0.30202867, 0.        , ..., 0.30566024, 0.32687789,
#         0.41242341],
#        ...,
#        [0.26294744, 0.02486983, 0.30566024, ..., 0.        , 0.18389493,
#         0.33863858],
#        [0.27876703, 0.20542291, 0.32687789, ..., 0.18389493, 0.        ,
#         0.33768547],
#        [0.39998796, 0.31795093, 0.41242341, ..., 0.33863858, 0.33768547,
#         0.        ]], shape=(5916, 5916))

# Candidate cluster counts and the seeds each one is repeated with
K_range = range(2, 21)
random_states = [207, 430, 437]

# Per-K averages over the seeds (combined Frechet + Gower distance)
avg_cost = []
avg_sil = []

for k in K_range:
    # (inertia, silhouette) of every seeded run for this K
    runs = []
    for rs in random_states:
        model = KMedoids(n_clusters=k, metric="precomputed", random_state=rs)
        labels = model.fit_predict(D_final)
        runs.append((model.inertia_, silhouette_score(D_final, labels, metric="precomputed")))

    temp_cost, temp_sil = zip(*runs)
    avg_cost.append(np.mean(temp_cost))
    avg_sil.append(np.mean(temp_sil))

# Side-by-side elbow and silhouette plots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (avg_cost, 'Average Elbow Plot (Frechet + Gower Distance)', 'Cost (Inertia)', {}),
    (avg_sil, 'Average Silhouette Score (Frechet + Gower Distance)', 'Score', {'color': 'green'}),
]
for panel_ax, (values, title, ylabel, line_style) in zip(axes, panels):
    panel_ax.plot(K_range, values, marker='o', **line_style)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Number of Clusters (K)')
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xticks(list(K_range))

plt.tight_layout()
plt.show()

# Same two metrics again, stacked vertically (2 rows, 1 column layout)
fig, axes = plt.subplots(2, 1, figsize=(7, 10))

stacked_panels = [
    (avg_cost, 'Average Elbow Plot (Frechet + Gower Distance)', 'Cost (Inertia)', {}),
    (avg_sil, 'Average Silhouette Score (Frechet + Gower Distance)', 'Score', {'color': 'green'}),
]
for panel_ax, (values, title, ylabel, line_style) in zip(axes, stacked_panels):
    panel_ax.plot(K_range, values, marker='o', **line_style)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Number of Clusters (K)')
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_xticks(list(K_range))

plt.tight_layout()
plt.show()

# Fit K-Medoids clustering to the combined (Frechet + Gower) distance matrix
# D_final, once with 7 clusters and once with 20 (same seed for both)
km7 = KMedoids(n_clusters=7, metric="precomputed", random_state=430)
km20 = KMedoids(n_clusters=20, metric="precomputed", random_state=430)

km7_pred = km7.fit_predict(D_final)
km20_pred = km20.fit_predict(D_final)

# Create 'Attack pattern' (store the cluster label of every shot sequence)
shot_sequence_df['Attack_Frechet_Gower7'] = km7_pred
shot_sequence_df['Attack_Frechet_Gower20'] = km20_pred

# Check the created clustering labels
shot_sequence_df.head(3)

# Package for UMAP
import umap.umap_ as umap

# Use 'precomputed' metric: D_final is passed as a distance matrix, not as features
umap_model = umap.UMAP(metric='precomputed', random_state=430)
embedding = umap_model.fit_transform(D_final)

# Create the DataFrame (first two embedding columns plus both cluster labelings)
df_umap = pd.DataFrame({
    'UMAP1': embedding[:, 0],
    'UMAP2': embedding[:, 1],
    'Cluster_7': shot_sequence_df['Attack_Frechet_Gower7'],
    'Cluster_20': shot_sequence_df['Attack_Frechet_Gower20']
})

# Set up 1x3 subplots for UMAP visualizations
fig, axes = plt.subplots(1, 3, figsize=(21, 6), constrained_layout=True)

# 1. Raw UMAP
sns.scatterplot(data=df_umap, x='UMAP1', y='UMAP2', ax=axes[0], color='gray', s=10)
axes[0].set_title('UMAP (No Cluster Labels)')

# 2. UMAP with 7 clusters
sns.scatterplot(data=df_umap, x='UMAP1', y='UMAP2', hue='Cluster_7', palette='tab10', ax=axes[1], s=10)
axes[1].set_title('UMAP (7 Clusters)')

# Move legend outside
axes[1].legend(title='Cluster', bbox_to_anchor=(1, 1), loc='upper left')

# 3. UMAP with 20 clusters
sns.scatterplot(data=df_umap, x='UMAP1', y='UMAP2', hue='Cluster_20', palette='tab20', ax=axes[2], s=10)
axes[2].set_title('UMAP (20 Clusters)')

# Move legend outside and lay its entries out in 2 columns
axes[2].legend(title='Cluster', bbox_to_anchor=(1, 1), loc='upper left', ncol=2)

# Set shared labels
for ax in axes:
    ax.set_xlabel('UMAP1')
    ax.set_ylabel('UMAP2')

plt.suptitle('UMAP Projections with Different Cluster Labels', fontsize=18, y=1.1)
plt.show()

# Stand-alone figure: UMAP embedding coloured by the 7-cluster labels
plt.figure(figsize=(7, 6))

# scatterplot draws on (and returns) the current axes of the new figure
umap_ax = sns.scatterplot(data=df_umap, x='UMAP1', y='UMAP2', hue='Cluster_7', palette='tab10', s=10)

# Title and labels
umap_ax.set_title('UMAP (7 Clusters)')
umap_ax.set_xlabel('UMAP1')
umap_ax.set_ylabel('UMAP2')

# Move legend outside
umap_ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Save as .csv file
shot_sequence_df.to_csv("shot_sequence_df_cluster.csv", index=False)

# Load the saved csv file
shot_sequence_df = pd.read_csv("shot_sequence_df_cluster.csv")

# Create contingency table for k=7 (rows: shooting team, columns: cluster label)
contingency_frechet_gower7 = pd.crosstab(shot_sequence_df['team_3'], shot_sequence_df['Attack_Frechet_Gower7'])

# Compute total shots per team
total_sum = contingency_frechet_gower7.sum(axis=1)

# Compute proportions (row-wise)
proportions = contingency_frechet_gower7.div(total_sum, axis=0)

# Get max cluster and proportion
max_proportion = proportions.max(axis=1).round(3)
max_cluster = proportions.idxmax(axis=1).astype(int)

# Compute second-highest cluster.
# nlargest(2) returns the two largest proportions in descending order and, on
# ties, keeps the first occurrence - the same tie rule as idxmax above - so
# the second entry can never name the cluster already reported as Max_cluster.
second_max_cluster = proportions.apply(lambda row: row.nlargest(2).index[1], axis=1).astype(int)

# Combine results (summary columns are appended to the count table)
contingency_frechet_gower7 = contingency_frechet_gower7.assign(
    Total_shots=total_sum,
    Max_cluster=max_cluster,
    Max2_cluster=second_max_cluster,
    Max_proportion=max_proportion
)

# Display the contingency table
contingency_frechet_gower7

# One panel per team on a 5 x 4 grid
rows, cols = 5, 4
fig, axes = plt.subplots(rows, cols, figsize=(20, 16))
fig.tight_layout(pad=4)

# Flatten the axes for easier iteration
axes = axes.flatten()

# Summary columns that are not cluster counts and must not be plotted
summary_columns = ['Total_shots', 'Max_proportion', 'Max_cluster', 'Max2_cluster']
counts_only = contingency_frechet_gower7.drop(columns=summary_columns)

# Draw each team's relative cluster frequencies as a bar chart
for i, (team, cluster_counts) in enumerate(counts_only.iterrows()):
    ax = axes[i]

    # Share of the team's shots falling in each cluster
    relative_freq = cluster_counts / cluster_counts.sum()

    # Cluster labels as integers for the x axis
    cluster_labels = relative_freq.index.astype(int)

    ax.bar(cluster_labels, relative_freq.values, color='dodgerblue')
    ax.set_title(f'{team}')
    ax.set_xlabel('Cluster label')
    ax.set_ylabel('Percentage')
    ax.set_xticks(cluster_labels)
    ax.set_xticklabels(cluster_labels, rotation=45)

# Add the main title
fig.suptitle("Distribution of Attack Pattern Cluster by Team (7 Clusters)", fontsize=16, fontweight='bold', y=1)
plt.show()

# Packages for the Chi-squared test & Fisher's Exact test
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact

# The independence tests must only see the observed cluster counts.
# contingency_frechet_gower7 also carries the appended summary columns
# (Total_shots, Max_cluster, Max2_cluster, Max_proportion); feeding those to
# the tests treats totals, labels and proportions as if they were counts and
# invalidates the p-values, so they are removed first.
# errors='ignore' keeps this working if the summaries were never appended.
summary_cols = ['Total_shots', 'Max_cluster', 'Max2_cluster', 'Max_proportion']
observed_counts7 = contingency_frechet_gower7.drop(columns=summary_cols, errors='ignore')

# Chi-squared test
_, p_value_chisq, _, _ = chi2_contingency(observed_counts7)
print(f"P-value (Chi-squared): {p_value_chisq}")

# Fisher's Exact test
# NOTE(review): tables larger than 2x2 are only accepted by recent SciPy
# releases -- confirm the installed version supports them.
_, p_value_fisher = fisher_exact(observed_counts7)
print(f"P-value (Fisher's Exact): {p_value_fisher}")

# Output: P-value (Chi-squared): 3.104035983995996e-07
# Output: P-value (Fisher's Exact): 0.0001

# Team x cluster contingency table for the k=20 clustering
contingency_frechet_gower20 = pd.crosstab(
    shot_sequence_df['team_3'],
    shot_sequence_df['Attack_Frechet_Gower20'],
)

# Total shots per team (row totals)
total_sum = contingency_frechet_gower20.sum(axis=1)

# Share of every cluster within a team: each row divided by its own total
proportions = contingency_frechet_gower20.div(total_sum, axis=0)

# Label of the largest cluster per team and the (rounded) share it holds
max_cluster = proportions.idxmax(axis=1).astype(int)
max_proportion = proportions.max(axis=1).round(3)

# Label of the runner-up cluster: order each row from largest to smallest
# share and read the column label found in second position
second_max_cluster = proportions.apply(
    lambda row: row.sort_values(ascending=False).index[1],
    axis=1,
).astype(int)

# Append the per-team summaries as extra columns (insertion order is kept)
summary_columns = {
    'Total_shots': total_sum,
    'Max_cluster': max_cluster,
    'Max2_cluster': second_max_cluster,
    'Max_proportion': max_proportion,
}
contingency_frechet_gower20 = contingency_frechet_gower20.assign(**summary_columns)

# Lift the column display limit so all columns of the table are shown
pd.set_option('display.max_columns', None)
contingency_frechet_gower20

# 5 x 4 grid of panels; panel i shows the i-th row (team) of the table
rows, cols = 5, 4
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 16))
fig.tight_layout(pad=4)

# A flat array of axes lets us address panels with a single index
axes = axes.flatten()

# Summary columns appended to the table; they are not cluster counts
summary_cols = ['Total_shots', 'Max_proportion', 'Max_cluster', 'Max2_cluster']

# One bar chart per team
for i, team in enumerate(contingency_frechet_gower20.index):
    ax = axes[i]

    # Keep only the per-cluster counts of this team
    cluster_counts = contingency_frechet_gower20.loc[team].drop(summary_cols)

    # Fraction of the team's shots that falls in each cluster
    relative_freq = cluster_counts / cluster_counts.sum()

    # Cluster labels as integers, used for bar positions and tick labels
    cluster_labels = relative_freq.index.astype(int)

    ax.bar(cluster_labels, relative_freq.values, color='dodgerblue')
    ax.set(title=f'{team}', xlabel='Cluster label', ylabel='Percentage')
    ax.set_xticks(cluster_labels)
    ax.set_xticklabels(cluster_labels, rotation=45)

# Figure-level title
fig.suptitle(
    "Distribution of Attack Pattern Cluster by Team (20 Clusters)",
    fontsize=16,
    fontweight='bold',
    y=1,
)
plt.show()

# The independence tests must only see the observed cluster counts.
# contingency_frechet_gower20 also carries the appended summary columns
# (Total_shots, Max_cluster, Max2_cluster, Max_proportion); feeding those to
# the tests treats totals, labels and proportions as if they were counts and
# invalidates the p-values, so they are removed first.
# errors='ignore' keeps this working if the summaries were never appended.
summary_cols = ['Total_shots', 'Max_cluster', 'Max2_cluster', 'Max_proportion']
observed_counts20 = contingency_frechet_gower20.drop(columns=summary_cols, errors='ignore')

# Chi-squared test
_, p_value_chisq, _, _ = chi2_contingency(observed_counts20)
print(f"P-value (Chi-squared): {p_value_chisq}")

# Fisher's exact test
# NOTE(review): tables larger than 2x2 are only accepted by recent SciPy
# releases -- confirm the installed version supports them.
_, p_value_fisher = fisher_exact(observed_counts20)
print(f"P-value (Fisher's Exact): {p_value_fisher}")

# Output: P-value (Chi-squared): 0.00011950401648507138
# Output: P-value (Fisher's Exact): 1.0

# The "big 6" EPL clubs; keep only the shot sequences whose team_3 is one of them
big6_teams = ['Manchester City', 'Manchester United', 'Liverpool', 'Arsenal', 'Chelsea', 'Tottenham Hotspur']
big6 = shot_sequence_df[shot_sequence_df['team_3'].isin(big6_teams)]

# Coordinate columns of the three events that make up a sequence
x_cols = ['x_1', 'x_2', 'x_3']
y_cols = ['y_1', 'y_2', 'y_3']

# Mean coordinates per club over the sequences assigned to cluster 0
in_cluster0 = big6['Attack_Frechet_Gower7'] == 0
manc_row0, manu_row0, tot_row0, liv_row0, chel_row0, ars_row0 = (
    big6[(big6['team_3'] == club) & in_cluster0][x_cols + y_cols].mean()
    for club in ['Manchester City', 'Manchester United', 'Tottenham Hotspur',
                 'Liverpool', 'Chelsea', 'Arsenal']
)

# Split each club's mean row into its x path and y path (3 points each)
club_rows0 = (manc_row0, manu_row0, tot_row0, liv_row0, chel_row0, ars_row0)
manc_x0, manu_x0, tot_x0, liv_x0, chel_x0, ars_x0 = (
    [row[c] for c in x_cols] for row in club_rows0
)
manc_y0, manu_y0, tot_y0, liv_y0, chel_y0, ars_y0 = (
    [row[c] for c in y_cols] for row in club_rows0
)

# Create a pitch
f = draw_pitch("#195905", "#f3efec", "h", "full")

# (x path, y path, colour, legend label) for every club, in drawing order
big6_paths0 = [
    (manc_x0, manc_y0, 'cyan', "Man City"),
    (manu_x0, manu_y0, 'red', "Man United"),
    (tot_x0, tot_y0, 'grey', "Tottenham"),
    (liv_x0, liv_y0, 'black', "Liverpool"),
    (chel_x0, chel_y0, 'dodgerblue', "Chelsea"),
    (ars_x0, ars_y0, 'yellow', "Arsenal"),
]

# Draw the average path of every club
for xs, ys, colour, club_label in big6_paths0:
    plt.plot(xs, ys, color=colour, linestyle='-', linewidth=1, markersize=4, label=club_label)

# Number the first two events and mark the last event with a star
for xs, ys, colour, club_label in big6_paths0:
    plt.text(xs[0], ys[0], '1', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.text(xs[1], ys[1], '2', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.scatter(xs[2], ys[2], marker='*', c=colour, s=25, zorder=13)

# Arrow and caption showing the attack direction below the pitch
plt.annotate("", xy=(25, -3), xytext=(5, -5), arrowprops=dict(arrowstyle="->", linewidth=2))
plt.text(7, -4, 'Attack (---------->)', fontsize=15)
plt.title("Average Attack Sequence of Big 6 Teams (Cluster 0)")
plt.legend(title="Team", bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

# Shot sequences of the Big 6 clubs that were assigned to cluster 0
big6_cluster0 = shot_sequence_df[
    shot_sequence_df['team_3'].isin(big6_teams)
    & (shot_sequence_df['Attack_Frechet_Gower7'] == 0)
]

# Per-group aggregations shared by both tables:
#   count         - number of sequences in the group
#   goal_count    - sequences whose tags_3 text contains the token 101
#   accuracy_rate - share of sequences whose shot_accuracy is 'accurate'
shot_outcome_aggs = {
    'count': ('shot_accuracy', 'size'),
    'goal_count': ('tags_3', lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum()),
    'accuracy_rate': ('shot_accuracy', lambda acc: (acc == 'accurate').mean().round(3)),
}
table_columns = ['count', 'goal_count', 'goal_percentage', 'accuracy_rate']

# Table group by event combinations - Big 6 & Cluster 0
(big6_cluster0
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 5')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns])

# Separate by Team - Big 6 & Cluster 0
(big6_cluster0
    .groupby(['team_3', 'subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns]
    .sort_values(['team_3', 'count'], ascending=[True, False]))

# Coordinate columns of the three events that make up a sequence
x_cols = ['x_1', 'x_2', 'x_3']
y_cols = ['y_1', 'y_2', 'y_3']

# Mean coordinates per club over the sequences assigned to cluster 3
# (the second max cluster)
in_cluster3 = big6['Attack_Frechet_Gower7'] == 3
manc_row3, manu_row3, tot_row3, liv_row3, chel_row3, ars_row3 = (
    big6[(big6['team_3'] == club) & in_cluster3][x_cols + y_cols].mean()
    for club in ['Manchester City', 'Manchester United', 'Tottenham Hotspur',
                 'Liverpool', 'Chelsea', 'Arsenal']
)

# Split each club's mean row into its x path and y path (3 points each)
club_rows3 = (manc_row3, manu_row3, tot_row3, liv_row3, chel_row3, ars_row3)
manc_x3, manu_x3, tot_x3, liv_x3, chel_x3, ars_x3 = (
    [row[c] for c in x_cols] for row in club_rows3
)
manc_y3, manu_y3, tot_y3, liv_y3, chel_y3, ars_y3 = (
    [row[c] for c in y_cols] for row in club_rows3
)

# Create a pitch
f = draw_pitch("#195905", "#f3efec", "h", "full")

# (x path, y path, colour, legend label) for every club, in drawing order
big6_paths3 = [
    (manc_x3, manc_y3, 'cyan', "Man City"),
    (manu_x3, manu_y3, 'red', "Man United"),
    (tot_x3, tot_y3, 'grey', "Tottenham"),
    (liv_x3, liv_y3, 'black', "Liverpool"),
    (chel_x3, chel_y3, 'dodgerblue', "Chelsea"),
    (ars_x3, ars_y3, 'yellow', "Arsenal"),
]

# Draw the average path of every club (second max cluster)
for xs, ys, colour, club_label in big6_paths3:
    plt.plot(xs, ys, color=colour, linestyle='-', linewidth=1, markersize=4, label=club_label)

# Number the first two events and mark the last event with a star
for xs, ys, colour, club_label in big6_paths3:
    plt.text(xs[0], ys[0], '1', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.text(xs[1], ys[1], '2', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.scatter(xs[2], ys[2], marker='*', c=colour, s=25, zorder=13)

# Arrow and caption showing the attack direction below the pitch
plt.annotate("", xy=(25, -3), xytext=(5, -5), arrowprops=dict(arrowstyle="->", linewidth=2))
plt.text(7, -4, 'Attack (---------->)', fontsize=15)
plt.title("Average Attack Sequence of Big 6 Teams (Cluster 3)")
plt.legend(title="Big 6 Teams", bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

# Big 6 shot sequences restricted to cluster 3, and to clusters 0 or 3
is_big6 = shot_sequence_df['team_3'].isin(big6_teams)
big6_cluster3 = shot_sequence_df[is_big6 & (shot_sequence_df['Attack_Frechet_Gower7'] == 3)]
big6_cluster0_or_3 = shot_sequence_df[is_big6 & shot_sequence_df['Attack_Frechet_Gower7'].isin([0, 3])]

# Per-group aggregations shared by all four tables:
#   count         - number of sequences in the group
#   goal_count    - sequences whose tags_3 text contains the token 101
#   accuracy_rate - share of sequences whose shot_accuracy is 'accurate'
shot_outcome_aggs = {
    'count': ('shot_accuracy', 'size'),
    'goal_count': ('tags_3', lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum()),
    'accuracy_rate': ('shot_accuracy', lambda acc: (acc == 'accurate').mean().round(3)),
}
table_columns = ['count', 'goal_count', 'goal_percentage', 'accuracy_rate']

# Table group by event combinations - Big 6 & Cluster 3
(big6_cluster3
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 5')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns])

# Separate by Team - Big 6 & Cluster 3
(big6_cluster3
    .groupby(['team_3', 'subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns]
    .sort_values(['team_3', 'count'], ascending=[True, False]))

# Table group by event combinations - Big 6 & Cluster 0 / 3
(big6_cluster0_or_3
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 10')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns])

# Separate by Team - Big 6 & Cluster 0 / 3
(big6_cluster0_or_3
    .groupby(['team_3', 'subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .query('count >= 5')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns]
    .sort_values(['team_3', 'count'], ascending=[True, False]))

# Summary statistics table for Big 6 clubs (Cluster 0)
# Numerical features to summarise
cols = ['progress_dist_12', 'progress_ratio_12',
        'progress_dist_23', 'progress_ratio_23',
        'event_duration_12', 'event_duration_23']

# Big 6 shot sequences assigned to cluster 0
is_big6 = shot_sequence_df['team_3'].isin(big6_teams)
is_cluster0 = shot_sequence_df['Attack_Frechet_Gower7'] == 0
filtered_df = shot_sequence_df[is_big6 & is_cluster0]

# Number of sequences per team
per_team = filtered_df.groupby('team_3')
team_counts = per_team.size().rename("count")

# Mean and standard deviation of every feature per team
agg_df = per_team[cols].agg(['mean', 'std'])

# Collapse the (feature, statistic) column pairs into "feature_statistic" names
agg_df.columns = ['_'.join(pair).strip() for pair in agg_df.columns]

# One text column per feature, formatted as "mean (std)" with 2 decimals
formatted_df = pd.DataFrame(
    {
        feature: agg_df[f"{feature}_mean"].round(2).astype(str)
        + " (" + agg_df[f"{feature}_std"].round(2).astype(str) + ")"
        for feature in cols
    },
    index=agg_df.index,
)

# Put the per-team count in front of the feature columns
formatted_df.insert(0, 'count', team_counts)

# Turn the team index into a regular column called 'Team' and hide the row index
formatted_df = formatted_df.reset_index().rename(columns={'team_3': 'Team'})
formatted_df.style.hide(axis='index')

# Summary statistics table for Big 6 clubs (Cluster 3)
# Big 6 shot sequences assigned to cluster 3
is_big6 = shot_sequence_df['team_3'].isin(big6_teams)
is_cluster3 = shot_sequence_df['Attack_Frechet_Gower7'] == 3
filtered_df = shot_sequence_df[is_big6 & is_cluster3]

# Number of sequences per team
per_team = filtered_df.groupby('team_3')
team_counts = per_team.size().rename("count")

# Mean and standard deviation of every feature in `cols` per team
agg_df = per_team[cols].agg(['mean', 'std'])

# Collapse the (feature, statistic) column pairs into "feature_statistic" names
agg_df.columns = ['_'.join(pair).strip() for pair in agg_df.columns]

# One text column per feature, formatted as "mean (std)" with 2 decimals
formatted_df = pd.DataFrame(
    {
        feature: agg_df[f"{feature}_mean"].round(2).astype(str)
        + " (" + agg_df[f"{feature}_std"].round(2).astype(str) + ")"
        for feature in cols
    },
    index=agg_df.index,
)

# Put the per-team count in front of the feature columns
formatted_df.insert(0, 'count', team_counts)

# Turn the team index into a regular column called 'Team' and hide the row index
formatted_df = formatted_df.reset_index().rename(columns={'team_3': 'Team'})
formatted_df.style.hide(axis='index')

# Filter Leicester City & Comparison Group with relatively many goals (Everton / Crystal Palace / AFC Bournemouth)
leicester = shot_sequence_df[shot_sequence_df['team_3'] == 'Leicester City']
# NOTE: the (misspelled) name `comparsion` is kept in case later code refers to it
comparsion = shot_sequence_df[
    shot_sequence_df['team_3'].isin(['Everton', 'Crystal Palace', 'AFC Bournemouth'])
]

# Coordinate columns of the three events that make up a sequence
coord_cols = ['x_1', 'x_2', 'x_3', 'y_1', 'y_2', 'y_3']

# Mean coordinates: Leicester City in cluster 5, comparison group in cluster 3.
# The comparison row used to be computed from `leicester`, so the line labelled
# "Comparison" actually showed Leicester's own cluster 3; it now uses `comparsion`.
leic_row5 = leicester[leicester['Attack_Frechet_Gower7'] == 5][coord_cols].mean()
comp_row3 = comparsion[comparsion['Attack_Frechet_Gower7'] == 3][coord_cols].mean()

# Extract x and y coordinates for each of the 3 points
leic_x5 = [leic_row5['x_1'], leic_row5['x_2'], leic_row5['x_3']]
leic_y5 = [leic_row5['y_1'], leic_row5['y_2'], leic_row5['y_3']]

comp_x3 = [comp_row3['x_1'], comp_row3['x_2'], comp_row3['x_3']]
comp_y3 = [comp_row3['y_1'], comp_row3['y_2'], comp_row3['y_3']]

# Create a pitch
f = draw_pitch("#195905", "#f3efec", "h", "full")

# (x path, y path, colour, legend label) for both groups, in drawing order
group_paths = [
    (leic_x5, leic_y5, 'white', "Leicester City"),
    (comp_x3, comp_y3, 'purple', "Comparison"),
]

# Draw the average path of each group
for xs, ys, colour, group_label in group_paths:
    plt.plot(xs, ys, color=colour, linestyle='-', linewidth=1, markersize=4, label=group_label)

# Number the first two events and mark the last event with a star
for xs, ys, colour, group_label in group_paths:
    plt.text(xs[0], ys[0], '1', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.text(xs[1], ys[1], '2', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.scatter(xs[2], ys[2], marker='*', c=colour, s=25, zorder=13)

# Annotation arrow for attack direction
plt.annotate("", xy=(25, -3), xytext=(5, -5), arrowprops=dict(arrowstyle="->", linewidth=2))
plt.text(7, -4, 'Attack (---------->)', fontsize=15)
plt.title("Average Attack Sequence of Leicester City vs Comparison (Cluster 5 vs 3)")
plt.legend(title="Team", bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

# Leicester City sequences in cluster 5
leicester_cluster5 = shot_sequence_df[
    (shot_sequence_df['team_3'] == 'Leicester City')
    & (shot_sequence_df['Attack_Frechet_Gower7'] == 5)
]

# Everton / Crystal Palace / AFC Bournemouth sequences in cluster 3
comparison_cluster3 = shot_sequence_df[
    shot_sequence_df['team_3'].isin(['Everton', 'Crystal Palace', 'AFC Bournemouth'])
    & (shot_sequence_df['Attack_Frechet_Gower7'] == 3)
]

# Per-group aggregations shared by both tables:
#   count         - number of sequences in the group
#   goal_count    - sequences whose tags_3 text contains the token 101
#   accuracy_rate - share of sequences whose shot_accuracy is 'accurate'
shot_outcome_aggs = {
    'count': ('shot_accuracy', 'size'),
    'goal_count': ('tags_3', lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum()),
    'accuracy_rate': ('shot_accuracy', lambda acc: (acc == 'accurate').mean().round(3)),
}
table_columns = ['count', 'goal_count', 'goal_percentage', 'accuracy_rate']

# Table group by event combinations - Leicester City
(leicester_cluster5
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns])

# Table group by event combinations - Everton / Crystal Palace / AFC Bournemouth
(comparison_cluster3
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 5')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, table_columns])

# Keep only Leicester City's shot sequences
leicester = shot_sequence_df[shot_sequence_df['team_3'] == 'Leicester City']

# Mean coordinates of Leicester City's sequences assigned to cluster 0
leic_row0 = leicester[leicester['Attack_Frechet_Gower7'] == 0][
    ['x_1', 'x_2', 'x_3', 'y_1', 'y_2', 'y_3']
].mean()

# x path and y path of the three averaged points
leic_x0 = [leic_row0[c] for c in ('x_1', 'x_2', 'x_3')]
leic_y0 = [leic_row0[c] for c in ('y_1', 'y_2', 'y_3')]

# Create a pitch
f = draw_pitch("#195905", "#f3efec", "h", "full")

# (x path, y path, colour, legend label) in drawing order: Leicester City
# first, then the Big 6 cluster 0 paths computed earlier in the file
attack_paths = [
    (leic_x0, leic_y0, 'white', "Leicester City"),
    (manc_x0, manc_y0, 'cyan', "Man City"),
    (manu_x0, manu_y0, 'red', "Man United"),
    (tot_x0, tot_y0, 'grey', "Tottenham"),
    (liv_x0, liv_y0, 'black', "Liverpool"),
    (chel_x0, chel_y0, 'dodgerblue', "Chelsea"),
    (ars_x0, ars_y0, 'yellow', "Arsenal"),
]

# Draw the average path of every club
for xs, ys, colour, club_label in attack_paths:
    plt.plot(xs, ys, color=colour, linestyle='-', linewidth=1, markersize=4, label=club_label)

# Number the first two events and mark the last event with a star
for xs, ys, colour, club_label in attack_paths:
    plt.text(xs[0], ys[0], '1', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.text(xs[1], ys[1], '2', fontsize=8, color=colour, fontweight='bold', ha='right', va='top')
    plt.scatter(xs[2], ys[2], marker='*', c=colour, s=25, zorder=13)

# Arrow and caption showing the attack direction below the pitch
plt.annotate("", xy=(25, -3), xytext=(5, -5), arrowprops=dict(arrowstyle="->", linewidth=2))
plt.text(7, -4, 'Attack (---------->)', fontsize=15)
plt.title("Average Attack Sequence of Leicester City vs Big 6 (Cluster 0)")
plt.legend(title="Team", bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

# Leicester City sequences in cluster 0
leicester_cluster0 = shot_sequence_df[
    (shot_sequence_df['team_3'] == 'Leicester City')
    & (shot_sequence_df['Attack_Frechet_Gower7'] == 0)
]

# Per-group aggregations:
#   count         - number of sequences in the group
#   goal_count    - sequences whose tags_3 text contains the token 101
#   accuracy_rate - share of sequences whose shot_accuracy is 'accurate'
shot_outcome_aggs = {
    'count': ('shot_accuracy', 'size'),
    'goal_count': ('tags_3', lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum()),
    'accuracy_rate': ('shot_accuracy', lambda acc: (acc == 'accurate').mean().round(3)),
}

# Table group by event combinations
(leicester_cluster0
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(**shot_outcome_aggs)
    .sort_values(by='count', ascending=False)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    .loc[:, ['count', 'goal_count', 'goal_percentage', 'accuracy_rate']])

# Leicester City sequences assigned to cluster 5 or cluster 0
is_leicester = shot_sequence_df['team_3'] == 'Leicester City'
in_selected_clusters = shot_sequence_df['Attack_Frechet_Gower7'].isin([5, 0])
filtered_df = shot_sequence_df[is_leicester & in_selected_clusters]

# Number of sequences per cluster
per_cluster = filtered_df.groupby('Attack_Frechet_Gower7')
cluster_counts = per_cluster.size().rename("count")

# Mean and standard deviation of every feature in `cols` per cluster
agg_df = per_cluster[cols].agg(['mean', 'std'])

# Collapse the (feature, statistic) column pairs into "feature_statistic" names
agg_df.columns = ['_'.join(pair).strip() for pair in agg_df.columns]

# One text column per feature, formatted as "mean (std)" with 2 decimals
formatted_df = pd.DataFrame(
    {
        feature: agg_df[f"{feature}_mean"].round(2).astype(str)
        + " (" + agg_df[f"{feature}_std"].round(2).astype(str) + ")"
        for feature in cols
    },
    index=agg_df.index,
)

# Put the per-cluster count in front of the feature columns
formatted_df.insert(0, 'count', cluster_counts)

# Expose the cluster label as a regular column named 'Cluster' and hide the row index
formatted_df = formatted_df.rename_axis('Cluster').reset_index()
formatted_df.style.hide(axis='index')

# Function to extract the most & second most involved players
def top_two_modes(series):
    """
    Return the most and second most frequent values of `series` as text.

    The previous implementation relied on `series.mode()`, which only returns
    the values tied for the highest frequency, so a second value was reported
    only on an exact tie. Values are now ranked by how often they occur.

    Parameters
    ----------
    series : pandas.Series
        Values to rank; missing values are ignored.

    Returns
    -------
    str or None
        "first / second" when at least two distinct values are present,
        "first" when only one distinct value is present, and None when the
        series has no non-missing values.
    """
    # Frequency of every distinct value (value_counts drops missing values)
    counts = series.value_counts(sort=False)
    if counts.empty:
        return None

    # Order by label first so that ties are broken deterministically
    # (smallest label first, as mode() did); unorderable labels keep their
    # first-seen order instead.
    try:
        counts = counts.sort_index()
    except TypeError:
        pass

    # Stable sort by descending frequency preserves the tie order set above
    ranked = counts.sort_values(ascending=False, kind='mergesort')
    top = ranked.index[:2]

    if len(top) >= 2:
        return f"{top[0]} / {top[1]}"
    return f"{top[0]}"

# Summary table for key players - cluster 5
# One row per (first event, second event) pairing of Leicester City's cluster-5
# sequences: sequence count, sequences carrying the goal tag (101), goal share,
# share of shots labelled 'accurate', and the top_two_modes() result for each
# of the three player columns. Pairings seen fewer than three times are dropped.
player_summary = (
    shot_sequence_df
    .loc[
        shot_sequence_df['team_3'].eq('Leicester City')
        & shot_sequence_df['Attack_Frechet_Gower7'].eq(5)
    ]
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(
        count=pd.NamedAgg(column='shot_accuracy', aggfunc='size'),
        goal_count=pd.NamedAgg(
            column='tags_3',
            aggfunc=lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum(),
        ),
        accuracy_rate=pd.NamedAgg(
            column='shot_accuracy',
            aggfunc=lambda outcome: (outcome == 'accurate').mean().round(3),
        ),
        player_1=pd.NamedAgg(column='player_1', aggfunc=top_two_modes),
        player_2=pd.NamedAgg(column='player_2', aggfunc=top_two_modes),
        player_3=pd.NamedAgg(column='player_3', aggfunc=top_two_modes),
    )
    .sort_values(by='count', ascending=False)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    [['count', 'goal_count', 'goal_percentage', 'accuracy_rate', 'player_1', 'player_2', 'player_3']]
)

# Show the resulting table
player_summary

# Summary table for key players - cluster 0
# One row per (first event, second event) pairing of Leicester City's cluster-0
# sequences: sequence count, sequences carrying the goal tag (101), goal share,
# share of shots labelled 'accurate', and the top_two_modes() result for each
# of the three player columns. Pairings seen fewer than three times are dropped.
player_summary = (
    shot_sequence_df
    .loc[
        shot_sequence_df['team_3'].eq('Leicester City')
        & shot_sequence_df['Attack_Frechet_Gower7'].eq(0)
    ]
    .groupby(['subEventName_1', 'subEventName_2'])
    .agg(
        count=pd.NamedAgg(column='shot_accuracy', aggfunc='size'),
        goal_count=pd.NamedAgg(
            column='tags_3',
            aggfunc=lambda tags: tags.str.contains(r'\b101\b', regex=True, na=False).sum(),
        ),
        accuracy_rate=pd.NamedAgg(
            column='shot_accuracy',
            aggfunc=lambda outcome: (outcome == 'accurate').mean().round(3),
        ),
        player_1=pd.NamedAgg(column='player_1', aggfunc=top_two_modes),
        player_2=pd.NamedAgg(column='player_2', aggfunc=top_two_modes),
        player_3=pd.NamedAgg(column='player_3', aggfunc=top_two_modes),
    )
    .sort_values(by='count', ascending=False)
    .query('count >= 3')
    .assign(goal_percentage=lambda tbl: (tbl['goal_count'] / tbl['count']).round(2))
    [['count', 'goal_count', 'goal_percentage', 'accuracy_rate', 'player_1', 'player_2', 'player_3']]
)

# Show the resulting table
player_summary

	event	subevent	event_label	subevent_label
0	1	10	Duel	Air duel
1	1	11	Duel	Ground attacking duel
2	1	12	Duel	Ground defending duel

	Unnamed: 0	passportArea	weight	firstName	middleName	lastName	currentTeamId	birthDate	height	role	birthArea	wyId	foot	shortName	currentNationalTeamId
0	0	{'name': 'Turkey', 'id': '792', 'alpha3code': ...	78	Harun	NaN	Tekin	4502.0	1989-06-17	187	{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...	{'name': 'Turkey', 'id': '792', 'alpha3code': ...	32777	right	H. Tekin	4687.0
1	1	{'name': 'Senegal', 'id': '686', 'alpha3code':...	73	Malang	NaN	Sarr	3775.0	1999-01-23	182	{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...	{'name': 'France', 'id': '250', 'alpha3code': ...	393228	left	M. Sarr	4423.0
2	2	{'name': 'France', 'id': '250', 'alpha3code': ...	72	Over	NaN	Mandanda	3772.0	1998-10-26	176	{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...	{'name': 'France', 'id': '250', 'alpha3code': ...	393230	NaN	O. Mandanda	NaN

	code	team
0	1659	AFC Bournemouth
1	1628	Crystal Palace
2	1609	Arsenal
3	1612	Liverpool
4	1611	Manchester United
5	1613	Newcastle United
6	1625	Manchester City
7	1639	Stoke City
8	1624	Tottenham Hotspur
9	1633	West Ham United
10	1631	Leicester City
11	1619	Southampton
12	1610	Chelsea
13	1644	Watford
14	1627	West Bromwich Albion
15	1651	Brighton & Hove Albion
16	1623	Everton
17	1646	Burnley
18	1673	Huddersfield Town
19	10531	Swansea City

	eventId	subEventName	tags	playerId	positions	matchId	eventName	teamId	matchPeriod	eventSec	subEventId	id
0	7	Touch	[{'id': 1302}]	8433	[{'y': 6, 'x': 39}, {'y': 23, 'x': 11}]	2500089	Others on the ball	1646	1H	1233.759219	72	251700446
1	8	Simple pass	[{'id': 302}, {'id': 1801}]	9739	[{'y': 77, 'x': 89}, {'y': 61, 'x': 88}]	2500089	Pass	1659	1H	1238.114351	85	251700603
2	10	Shot	[{'id': 402}, {'id': 201}, {'id': 1201}, {'id'...	245813	[{'y': 61, 'x': 88}, {'y': 100, 'x': 100}]	2500089	Shot	1659	1H	1239.333857	100	251700604

	eventId	subEventName	tags	playerId	positions	matchId	eventName	teamId	matchPeriod	eventSec	subEventId	id	x	y
0	7	Touch	[{'id': 1302}]	8433	[{'y': 6, 'x': 39}, {'y': 23, 'x': 11}]	2500089	Others on the ball	1646	1H	1233.759219	72	251700446	61	94
1	8	Simple pass	[{'id': 302}, {'id': 1801}]	9739	[{'y': 77, 'x': 89}, {'y': 61, 'x': 88}]	2500089	Pass	1659	1H	1238.114351	85	251700603	89	77
2	10	Shot	[{'id': 402}, {'id': 201}, {'id': 1201}, {'id'...	245813	[{'y': 61, 'x': 88}, {'y': 100, 'x': 100}]	2500089	Shot	1659	1H	1239.333857	100	251700604	88	61

STAT 430 Final Project - EPL Team Attack Pattern Sequence¶

Introduction & Motivation¶

Research Question¶

Why Shot sequence?¶

Data Preprocessing¶

Defining 'Attack Pattern'¶

What is attack pattern?¶

How many events?¶

Check-point¶

Adding New Features - Progression & Time Duration¶

Analysis 1 - Latent Variable with Unsupervised Learning¶

"Spatial-Only" Clustering¶

Frechet Distance¶

K-Medoids Clustering¶

Combining Categorical Features¶

Choosing Optimal Number of Clusters (K)¶

Dimensionality Reduction - UMAP¶

Analysis 2 - Translating Research Question into Mathematical Equation¶

Mathematical Question to Statistical Test¶

Chi-squared & Fisher's Exact Test¶

Hypothesis Testing¶

Further Analysis¶

1. Big 6 Teams' Attack Sequence¶

Combined Cluster Stats¶

Summary Statistics (Average) of Numerical Features¶

1. Cluster 0¶

2. Cluster 3¶

2. What makes Leicester City different?¶

Potential clue to scoring more goals than any other team besides the Big 6 clubs¶

Analysis of Cluster 5 & Comparison Group¶

Analysis of Cluster 0 & Comparison with Big 6 Clubs¶

Summary Statistics Table - Leicester City¶

Key Players for Leicester City's Attack Sequences¶

References & Resources¶

Attack_Frechet_Gower7	0	1	2	3	4	5	6	Total_shots	Max_cluster	Max2_cluster	Max_proportion
team_3
AFC Bournemouth	51	42	31	54	30	45	32	285	3	0	0.189
Arsenal	101	36	60	79	34	33	53	396	0	3	0.255
Brighton & Hove Albion	36	30	24	43	46	37	35	251	4	3	0.183
Burnley	38	24	35	38	51	30	43	259	4	6	0.197
Chelsea	69	44	51	68	42	65	39	378	0	3	0.183
Crystal Palace	42	31	47	48	46	40	35	289	3	2	0.166
Everton	36	19	35	43	43	28	27	231	3	4	0.186
Huddersfield Town	33	29	29	44	38	39	22	234	3	5	0.188
Leicester City	48	27	36	35	45	51	27	269	5	0	0.190
Liverpool	89	35	65	78	43	73	35	418	0	3	0.213
Manchester City	126	42	82	88	41	53	35	467	0	3	0.270
Manchester United	65	28	45	57	24	49	38	306	0	3	0.212
Newcastle United	59	27	38	45	38	31	41	279	0	3	0.211
Southampton	50	30	35	55	37	43	34	284	3	0	0.194
Stoke City	46	26	37	36	46	31	30	252	0	4	0.183
Swansea City	32	9	37	29	27	30	29	193	2	0	0.192
Tottenham Hotspur	86	31	54	60	42	53	34	360	0	3	0.239
Watford	46	27	33	46	59	42	32	285	4	0	0.207
West Bromwich Albion	32	24	40	36	46	29	37	244	4	2	0.189
West Ham United	37	20	38	38	34	36	33	236	2	3	0.161

		count	goal_count	goal_percentage	accuracy_rate
subEventName_1	subEventName_2
Simple pass	Simple pass	156	31	0.20	1.0
	Smart pass	59	20	0.34	1.0
	Cross	51	29	0.57	1.0
	High pass	27	9	0.33	1.0
Ground attacking duel	Simple pass	22	8	0.36	1.0
Touch	Simple pass	16	6	0.38	1.0
Acceleration	Smart pass	15	8	0.53	1.0
Smart pass	Cross	14	8	0.57	1.0
Acceleration	Simple pass	13	4	0.31	1.0
High pass	Simple pass	12	5	0.42	1.0
Smart pass	Simple pass	12	9	0.75	1.0
Ground attacking duel	Cross	12	7	0.58	1.0
Ground attacking duel	Smart pass	11	5	0.45	1.0
Simple pass	Touch	10	0	0.00	1.0
Simple pass	Acceleration	9	0	0.00	1.0
Cross	Simple pass	7	4	0.57	1.0
High pass	Cross	5	4	0.80	1.0
Acceleration	Cross	5	2	0.40	1.0

Team	count	progress_dist_12	progress_ratio_12	progress_dist_23	progress_ratio_23	event_duration_12	event_duration_23
Arsenal	101	8.93 (13.44)	0.33 (0.58)	7.1 (11.42)	0.36 (0.52)	2.16 (1.45)	1.69 (0.82)
Chelsea	69	9.36 (17.64)	0.32 (0.55)	12.14 (13.3)	0.4 (0.44)	2.21 (1.28)	2.08 (1.33)
Liverpool	89	10.11 (14.75)	0.46 (0.52)	11.07 (13.05)	0.41 (0.46)	2.23 (1.33)	1.98 (1.01)
Manchester City	126	9.94 (14.82)	0.42 (0.58)	6.75 (13.08)	0.24 (0.49)	2.4 (1.22)	1.84 (0.93)
Manchester United	65	6.98 (13.4)	0.29 (0.51)	8.52 (11.53)	0.4 (0.43)	2.17 (1.36)	1.83 (0.9)
Tottenham Hotspur	86	11.45 (15.64)	0.43 (0.53)	8.35 (11.89)	0.31 (0.46)	2.68 (1.39)	1.9 (1.03)

Team	count	progress_dist_12	progress_ratio_12	progress_dist_23	progress_ratio_23	event_duration_12	event_duration_23
Arsenal	79	8.8 (12.19)	0.34 (0.55)	2.92 (9.91)	0.1 (0.46)	2.38 (1.73)	1.69 (0.85)
Chelsea	68	5.62 (18.77)	0.13 (0.58)	4.81 (14.79)	0.06 (0.52)	2.49 (2.35)	1.61 (1.02)
Liverpool	78	5.99 (14.95)	0.23 (0.58)	10.4 (14.14)	0.38 (0.51)	2.77 (3.01)	1.94 (1.02)
Manchester City	88	9.2 (15.41)	0.37 (0.58)	4.86 (11.72)	0.17 (0.48)	2.54 (1.38)	1.79 (0.79)
Manchester United	57	9.33 (16.74)	0.26 (0.63)	3.56 (11.14)	0.1 (0.48)	2.56 (2.01)	1.94 (0.92)
Tottenham Hotspur	60	7.43 (10.71)	0.34 (0.57)	10.55 (14.3)	0.33 (0.43)	2.65 (1.52)	1.93 (1.03)

Cluster	count	progress_dist_12	progress_ratio_12	progress_dist_23	progress_ratio_23	event_duration_12	event_duration_23
0	48	15.31 (14.09)	0.56 (0.45)	9.5 (12.13)	0.37 (0.48)	2.87 (1.46)	1.59 (0.88)
5	51	8.18 (33.78)	0.28 (0.62)	5.86 (19.04)	0.15 (0.51)	3.29 (10.68)	1.33 (0.85)

	Tag	Label	Description
0	101	Goal	Goal
1	102	own_goal	Own goal
2	301	assist	Assist