Analysis of Competitive Rubik's Cube Solving

Liam Chen - CMSC320 Final Tutorial

[Image of a Rubik's Cube]

Introduction

The Rubik's Cube is a puzzle that has captured the attention and excitement of people around the world for decades. Once seen as a nearly impossible task, solving the Rubik's Cube has evolved over time into a fun challenge that, after dedicated practice, can be completed in mere seconds. There is a whole world of competitive Rubik's Cube solvers who compete to see who can solve a variety of puzzles, in a variety of ways, the fastest.

In this tutorial, we will explore and analyze data from the world of competitive Rubik's Cube solving. Through this data analysis, we aim to gain insight into questions about the current state and future of cubing as an activity.

This tutorial uses some terminology that may be unfamiliar. The terms will make more sense in the context they are given, but here is a basic glossary:

  • WCA - aka the World Cube Association, the governing body that organizes, oversees, and regulates Rubik's Cube competitions all over the world
  • cubing - the activity/hobby/sport of solving Rubik's cubes
  • 3x3 - the standard 3x3x3 Rubik's Cube
  • event - aside from the standard 3x3, there are many different events in which competitors compete, including 4x4, 3x3 blindfolded, and many more
  • solve - a single attempt at solving a Rubik's Cube (e.g. "that solve was really fast!")
  • single - the time of one solve, usually used in contrast to average (e.g. "I got a 9.50 single today!")
  • average - the arithmetic mean of the middle 3 solves out of 5 (i.e. out of 5 solves, the fastest and slowest are disregarded and the rest are averaged, which helps eliminate luck as a factor). In some less common contexts, average refers to the plain arithmetic mean of all solves.
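To make the single and average definitions concrete, here is a minimal sketch (plain Python, with made-up solve times) of how both are computed from a set of five solves:

```python
def wca_average(times):
    """Mean of the middle three of five solves: the fastest
    and slowest solves are dropped before averaging."""
    assert len(times) == 5
    middle_three = sorted(times)[1:4]
    return round(sum(middle_three) / 3, 2)

solves = [9.50, 11.23, 10.87, 14.02, 10.11]  # five solve times in seconds
single = min(solves)           # best single: 9.50
average = wca_average(solves)  # mean of 10.11, 10.87, 11.23 -> 10.74
```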

Data Collection & Setup

The World Cube Association conveniently provides an exportable SQL file of the entire database. The most recent export can be obtained on this page; however, this tutorial uses the May 04 2023 6:02 GMT export of the database (download it here). Because the WCA database updates frequently, it was necessary to settle on a single version in order to facilitate development of this tutorial.

Due to GitHub's restrictions on file size, we were not able to include the WCA database in this repository. If you want to follow along with this tutorial exactly, you must download the database from the link provided above.

Aside from the required Python libraries, this tutorial also requires MySQL Server in order to follow along. This tutorial will not detail how to get MySQL Server installed and running; the process is fairly straightforward and there are many online guides that can help.

Clone this repository here, and navigate into it. Install all the required Python libraries by running the cell below, or by running the command (without the % symbol) in your terminal.

This setup portion assumes basic familiarity with the command line and git repositories. For more information, check out the recommended readings: First steps with git, Git Basics, Command line crash course, Command line basics

In [ ]:
%pip install -r requirements.txt
In [160]:
# imports
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from getpass import getpass
from mysql.connector import connect
from sklearn import svm
from sklearn.linear_model import LinearRegression, LogisticRegression
import statsmodels.formula.api as sm
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler

# suppressing the "pandas only supports SQLAlchemy" warning
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
# full warning: 
# """ UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. 
# Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. """

Run the cell below and input your MySQL Server username and password (they are kept in memory until the kernel is stopped). Depending on how you configured MySQL Server, you may not need a username or a password. If this is the case, you can press enter twice to skip both fields.

Most users can run mysql -u root in their terminal to log into MySQL Server. If you can do this successfully, that means that you can press enter twice to skip both fields after running the cell below.

In [4]:
mysql_username = input("Enter username: ")
mysql_password = getpass("Enter password: ")

Now we can establish a connection with MySQL Server using the mysql-connector-python library. First we create a connection object and use it to create a cursor object, then we execute a query to create an empty database called wca. Lastly, we close the cursor and the connection.

In [19]:
# establish connection with MySQL Server
connection = connect(
    host="localhost",
    user=mysql_username,
    password=mysql_password
)
qry_create_db = "CREATE DATABASE wca"   # query to create an empty database called "wca"
cursor = connection.cursor()            # create the cursor
cursor.execute(qry_create_db)           # execute the query
cursor.close()                          # close the cursor
connection.close()                      # close the connection

Now, we use our downloaded SQL file to actually populate the database. If you have set up MySQL Server in the default configuration and are using the same SQL file as I am, run the following cell (or run the command in your terminal, without the ! symbol).

If you have configured a specific MySQL Server username/password, or if you are using a different version of the SQL file downloaded from the WCA, then run the following command in your terminal:

mysql -u username -p wca < filename.sql

replacing username with your MySQL username and replacing filename.sql with the [path to the] SQL file you want to use. The -p flag will prompt you for your password; to supply it inline instead, attach it directly to the flag with no space (e.g. -pYourPassword).

Note that the WCA database is fairly large, so don't be alarmed if this command takes a few minutes to execute.

In [20]:
!mysql -u root wca < WCA_export192_20230504T040001Z/WCA_export.sql

Queries

Now we can finally get started! Let's run a basic query on the database: we want to see the top fifty 3x3 solvers in the world.

We construct the appropriate SQL query, establish the connection to the database, and use the pandas function read_sql in order to execute this query and retrieve the result as a dataframe.

In [50]:
# establish the connection to our database "wca"
connection = connect(host="localhost", user=mysql_username, password=mysql_password, database="wca")

qry_top_50 = """
                SELECT * 
                FROM RanksSingle 
                WHERE eventID='333'     
                LIMIT 50 ;
            """

# execute the query and store the result in a dataframe
df_top_50 = pd.read_sql(qry_top_50, con=connection)
display(df_top_50)

connection.close() # close the connection
personId eventId best worldRank continentRank countryRank
0 2015DUYU01 333 347 1 1 1
1 2012PARK03 333 363 2 1 1
2 2016JONE04 333 388 3 1 1
3 2017KIMM01 333 389 4 2 2
4 2019WANY36 333 390 5 2 2
5 2017GARR05 333 395 6 3 3
6 2016KOLA02 333 397 7 1 1
7 2017SIAU02 333 403 8 4 4
8 2017XURU04 333 406 9 3 3
9 2016INAB01 333 413 10 5 5
10 2009ZEMD01 333 416 11 2 2
11 2016DEXT01 333 419 12 3 3
12 2012PONC02 333 424 13 6 6
13 2016SANT08 333 426 14 7 1
14 2015BORR01 333 431 15 4 1
15 2010WEYE02 333 432 16 2 1
16 2016SATO01 333 437 17 1 1
17 2015SANC11 333 438 18 8 7
18 2018DULL01 333 438 18 3 1
19 2019REDI02 333 438 18 8 7
20 2016SHEL03 333 442 21 4 1
21 2015FUSH01 333 442 21 5 1
22 2015MOHA10 333 444 23 10 9
23 2014SEBA01 333 444 23 5 1
24 2015MILL01 333 448 25 11 10
25 2015MACK06 333 448 25 11 10
26 2016LINB01 333 450 27 13 2
27 2021ZHAN01 333 451 28 6 4
28 2015LARS04 333 453 29 6 1
29 2015GRIE02 333 454 30 14 12
30 2010KIPA01 333 459 31 7 2
31 2014MILL04 333 459 31 7 1
32 2012CHOS01 333 459 31 7 1
33 2010WANG68 333 462 34 15 3
34 2015DELA05 333 463 35 9 1
35 2013BALI01 333 464 36 16 13
36 2022NUNE01 333 465 37 2 1
37 2018OPAC01 333 466 38 10 3
38 2012TYCK01 333 468 39 17 14
39 2013JOHN10 333 468 39 17 14
40 2015CHER07 333 468 39 17 14
41 2018KHAN28 333 471 42 20 17
42 2014BONA02 333 472 43 3 1
43 2014XUZI01 333 472 43 8 5
44 2017WONG01 333 473 45 9 2
45 2007VALK01 333 474 46 11 2
46 2016YENC01 333 475 47 21 18
47 2019SICH01 333 475 47 21 18
48 2012LEWI01 333 475 47 21 18
49 2010BRAD01 333 476 50 24 21

Interpreting the above results, we can see the top fifty 3x3 solvers in the world, ranked by their best single. We can see each competitor's personId (aka their WCA ID, a unique identifier for every WCA competitor), their best single result (measured in centiseconds), and their world, continental, and national rankings (all based on their best single time). The eventId of 333 means that these results are for the 3x3 Rubik's Cube event. We can see that the best 3x3 single result ever is 3.47 seconds (stored as 347), held by 2015DUYU01.
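Since results throughout the database are stored as integer centiseconds, a tiny helper (our own, not part of the WCA export) illustrates the conversion to a readable seconds string:

```python
def centiseconds_to_str(cs):
    """Format a WCA result stored as integer centiseconds,
    e.g. 347 -> '3.47' (seconds)."""
    return f"{cs / 100:.2f}"

centiseconds_to_str(347)   # -> '3.47', the world record single above
centiseconds_to_str(1671)  # -> '16.71'
```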

Seeing the results is great, but with only the WCA ID of each competitor, we don't know much about them. What if we want to see their names, what country they are from, and their gender?

In [51]:
# using the Python `with` statement for easier resource management
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_top_50_extra = """
                    SELECT Persons.id, Persons.name, Persons.countryId, Persons.gender, RanksSingle.best, RanksSingle.worldRank
                    FROM RanksSingle
                    INNER JOIN Persons ON RanksSingle.personID=Persons.id
                    WHERE RanksSingle.eventID='333'
                    ORDER BY RanksSingle.best
                    LIMIT 50 ;
                """

    df_top_50_extra = pd.read_sql(qry_top_50_extra, con=connection)
    display(df_top_50_extra)
id name countryId gender best worldRank
0 2015DUYU01 Yusheng Du (杜宇生) China m 347 1
1 2012PARK03 Max Park USA m 363 2
2 2016JONE04 Jode Brewster Australia m 388 3
3 2017KIMM01 Asher Kim-Magierek USA m 389 4
4 2019WANY36 Yiheng Wang (王艺衡) China m 390 5
5 2017GARR05 Luke Garrett USA m 395 6
6 2016KOLA02 Tymon Kolasiński Poland m 397 7
7 2017SIAU02 Max Siauw USA m 403 8
8 2017XURU04 Ruihang Xu (许瑞航) China m 406 9
9 2016INAB01 Matty Hiroto Inaba USA m 413 10
10 2009ZEMD01 Feliks Zemdegs Australia m 416 11
11 2016DEXT01 Riley Dexter Australia m 419 12
12 2012PONC02 Patrick Ponce USA m 424 13
13 2016SANT08 Kyle Santucci Canada m 426 14
14 2015BORR01 Leo Borromeo Philippines m 431 15
15 2010WEYE02 Sebastian Weyer Germany m 432 16
16 2016SATO01 Caio Hideaki Sato Brazil m 437 17
17 2018DULL01 Twan Dullemond Netherlands m 438 18
18 2015SANC11 Nicolás Sánchez USA m 438 18
19 2019REDI02 Dominic Redisi USA m 438 18
20 2015FUSH01 Firstian Fushada (符逢城) Indonesia m 442 21
21 2016SHEL03 Lukas Shelley Denmark m 442 21
22 2014SEBA01 Juliette Sébastien France f 444 23
23 2015MOHA10 Varun Mohanraj USA m 444 23
24 2015MILL01 Dylan Miller USA m 448 25
25 2015MACK06 Zeke Mackay USA m 448 25
26 2016LINB01 Brennen Lin Canada m 450 27
27 2021ZHAN01 Bofan Zhang (张博藩) China m 451 28
28 2015LARS04 Kim Roger Høyland Larsen Norway m 453 29
29 2015LARS04 Kim Roger Haraldsen Norway m 453 29
30 2015GRIE02 Luke Griesser USA m 454 30
31 2014MILL04 Chris Mills United Kingdom m 459 31
32 2010KIPA01 Jakub Kipa Poland m 459 31
33 2012CHOS01 SeungBeom Cho (조승범) Korea m 459 31
34 2010WANG68 Bill Wang Canada m 462 34
35 2015DELA05 Richard Delacoste Switzerland m 463 35
36 2013BALI01 Tanzer Balimtas USA m 464 36
37 2022NUNE01 Robert Yomi Cadenas Nuñez Peru m 465 37
38 2018OPAC01 Kajetan Opach Poland m 466 38
39 2013JOHN10 Brian Johnson USA m 468 39
40 2015CHER07 Tommy Cherry USA m 468 39
41 2012TYCK01 Luke Tycksen USA m 468 39
42 2018KHAN28 Zayn Khanani USA m 471 42
43 2014BONA02 Bautista Bonazzola Argentina m 472 43
44 2014XUZI01 Zibo Xu (徐子博) China m 472 43
45 2017WONG01 Brenton Angelo Lo Wong Philippines m 473 45
46 2007VALK01 Mats Valk Netherlands m 474 46
47 2016YENC01 Christopher Yen USA m 475 47
48 2012LEWI01 Phillip Lewicki USA m 475 47
49 2019SICH01 Oliver Michael Sitja Sichel USA m 475 47

Looks pretty good! If we compare this dataframe (df_top_50_extra) with the previous one (df_top_50), we notice some slight discrepancies. Notice how df_top_50 ends with 2010BRAD01's result of 4.76 seconds, but df_top_50_extra ends with 2019SICH01's result of 4.75 seconds. This would suggest that we have an extra value in our current dataframe (df_top_50_extra) which is pushing everything down (thus some slower results are pushed out of the top 50).

Why does this discrepancy exist when all we are doing is adding extra information like competitor name and country? If we look closely at df_top_50_extra, we can see an anomaly...

In [44]:
display(df_top_50_extra[28:30])
id name countryId gender best worldRank
28 2015LARS04 Kim Roger Høyland Larsen Norway m 453 29
29 2015LARS04 Kim Roger Haraldsen Norway m 453 29

We have a duplicate! These two records have similar names, as well as the exact same countryId, gender, best, and worldRank. This would all be fine (albeit suspicious and unlikely) if not for the fact that the id field is the same. WCA IDs are supposed to be unique identifiers for competitors, and we queried for the top fifty 3x3 competitors in the world, so no competitor should appear twice in our dataframe.

The reason this happens is that the WCA allows competitors to change certain data about themselves, such as when a competitor moves countries or legally changes their name. When such a change happens, the competitor's record in the Persons table is not updated; rather, a new record is created with the altered data. We can see this by querying the Persons table for 2015LARS04:

In [52]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_2015lars04 = """
                    SELECT * 
                    FROM Persons
                    WHERE id = '2015LARS04' ;
                """
    df_2015lars04 = pd.read_sql(qry_2015lars04, con=connection)
    display(df_2015lars04)
id subid name countryId gender
0 2015LARS04 1 Kim Roger Haraldsen Norway m
1 2015LARS04 2 Kim Roger Høyland Larsen Norway m

Here we see two records with the same id of 2015LARS04. name is the field that differs. As mentioned previously, this represents the same competitor, before and after a legal name change. The subid field is important here, because it indicates that their Persons record has been changed before. The current version is represented by a subid of 1.
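As an aside, if such duplicates had already made it into a dataframe, the same fix could be applied on the pandas side. A minimal sketch, using a toy stand-in for the Persons table (values taken from the output above):

```python
import pandas as pd

# toy stand-in for the Persons table
persons = pd.DataFrame({
    "id":    ["2015LARS04", "2015LARS04", "2012PARK03"],
    "subid": [1, 2, 1],
    "name":  ["Kim Roger Haraldsen", "Kim Roger Høyland Larsen", "Max Park"],
})

# keep only the current (subid == 1) record for each competitor
current = persons[persons["subid"] == 1]
```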

Knowing this, we can alter our query for the top fifty 3x3 solvers (including name, country, and gender) to disallow duplicate WCA IDs. We can do this by adding Persons.subid=1 to the WHERE clause.

In [58]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_top_50_extra = """
                    SELECT Persons.id, Persons.name, Persons.countryId, Persons.gender, RanksSingle.best, RanksSingle.worldRank
                    FROM RanksSingle
                    INNER JOIN Persons ON RanksSingle.personID=Persons.id
                    WHERE RanksSingle.eventID='333' 
                        AND Persons.subid=1
                    ORDER BY RanksSingle.best
                    LIMIT 50 ;
                """

    df_top_50_extra = pd.read_sql(qry_top_50_extra, con=connection)
    display(df_top_50_extra)
id name countryId gender best worldRank
0 2015DUYU01 Yusheng Du (杜宇生) China m 347 1
1 2012PARK03 Max Park USA m 363 2
2 2016JONE04 Jode Brewster Australia m 388 3
3 2017KIMM01 Asher Kim-Magierek USA m 389 4
4 2019WANY36 Yiheng Wang (王艺衡) China m 390 5
5 2017GARR05 Luke Garrett USA m 395 6
6 2016KOLA02 Tymon Kolasiński Poland m 397 7
7 2017SIAU02 Max Siauw USA m 403 8
8 2017XURU04 Ruihang Xu (许瑞航) China m 406 9
9 2016INAB01 Matty Hiroto Inaba USA m 413 10
10 2009ZEMD01 Feliks Zemdegs Australia m 416 11
11 2016DEXT01 Riley Dexter Australia m 419 12
12 2012PONC02 Patrick Ponce USA m 424 13
13 2016SANT08 Kyle Santucci Canada m 426 14
14 2015BORR01 Leo Borromeo Philippines m 431 15
15 2010WEYE02 Sebastian Weyer Germany m 432 16
16 2016SATO01 Caio Hideaki Sato Brazil m 437 17
17 2018DULL01 Twan Dullemond Netherlands m 438 18
18 2015SANC11 Nicolás Sánchez USA m 438 18
19 2019REDI02 Dominic Redisi USA m 438 18
20 2015FUSH01 Firstian Fushada (符逢城) Indonesia m 442 21
21 2016SHEL03 Lukas Shelley Denmark m 442 21
22 2014SEBA01 Juliette Sébastien France f 444 23
23 2015MOHA10 Varun Mohanraj USA m 444 23
24 2015MILL01 Dylan Miller USA m 448 25
25 2015MACK06 Zeke Mackay USA m 448 25
26 2016LINB01 Brennen Lin Canada m 450 27
27 2021ZHAN01 Bofan Zhang (张博藩) China m 451 28
28 2015LARS04 Kim Roger Haraldsen Norway m 453 29
29 2015GRIE02 Luke Griesser USA m 454 30
30 2012CHOS01 SeungBeom Cho (조승범) Korea m 459 31
31 2014MILL04 Chris Mills United Kingdom m 459 31
32 2010KIPA01 Jakub Kipa Poland m 459 31
33 2010WANG68 Bill Wang Canada m 462 34
34 2015DELA05 Richard Delacoste Switzerland m 463 35
35 2013BALI01 Tanzer Balimtas USA m 464 36
36 2022NUNE01 Robert Yomi Cadenas Nuñez Peru m 465 37
37 2018OPAC01 Kajetan Opach Poland m 466 38
38 2012TYCK01 Luke Tycksen USA m 468 39
39 2013JOHN10 Brian Johnson USA m 468 39
40 2015CHER07 Tommy Cherry USA m 468 39
41 2018KHAN28 Zayn Khanani USA m 471 42
42 2014XUZI01 Zibo Xu (徐子博) China m 472 43
43 2014BONA02 Bautista Bonazzola Argentina m 472 43
44 2017WONG01 Brenton Angelo Lo Wong Philippines m 473 45
45 2007VALK01 Mats Valk Netherlands m 474 46
46 2016YENC01 Christopher Yen USA m 475 47
47 2019SICH01 Oliver Michael Sitja Sichel USA m 475 47
48 2012LEWI01 Phillip Lewicki USA m 475 47
49 2010BRAD01 Drew Brads USA m 476 50

No more duplicates!
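We can also confirm this programmatically with pandas' duplicated; a sketch on a toy frame shaped like df_top_50_extra:

```python
import pandas as pd

# toy frame standing in for df_top_50_extra after the subid filter
df = pd.DataFrame({
    "id":   ["2015DUYU01", "2012PARK03", "2015LARS04"],
    "best": [347, 363, 453],
})

# duplicated().any() is True if any WCA ID appears more than once
has_duplicates = df["id"].duplicated().any()  # False here
```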

Let's run some more involved queries to get more familiar with the database schema. What if we wanted to see the top rankings in other events, such as the 4x4 cube, ranked by competitors' best average of five solves?

In [63]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    # querying the RanksAverage table instead of RanksSingle table 
    # in order to get competitor's best WCA average of five solves in a round
    qry_top_20_444_avg = """
                    SELECT Persons.id, Persons.name, Persons.countryId, Persons.gender, RanksAverage.best, RanksAverage.worldRank
                    FROM RanksAverage
                    INNER JOIN Persons ON RanksAverage.personID=Persons.id
                    WHERE RanksAverage.eventID='444' 
                        AND Persons.subid=1
                    ORDER BY RanksAverage.best
                    LIMIT 20 ;
                """
    df_top_20_444_avg = pd.read_sql(qry_top_20_444_avg, con=connection)
    display(df_top_20_444_avg)
id name countryId gender best worldRank
0 2012PARK03 Max Park USA m 1938 1
1 2010WEYE02 Sebastian Weyer Germany m 2146 2
2 2009ZEMD01 Feliks Zemdegs Australia m 2157 3
3 2013NAHM01 Seung Hyuk Nahm (남승혁) Korea m 2167 4
4 2016KOLA02 Tymon Kolasiński Poland m 2171 5
5 2016INAB01 Matty Hiroto Inaba USA m 2254 6
6 2015WANG09 Kai-Wen Wang (王楷文) Taiwan m 2261 7
7 2012PONC02 Patrick Ponce USA m 2307 8
8 2016NUNE11 Brandon Nunez USA m 2361 9
9 2017RAND06 Ari Randers-Pehrson USA m 2367 10
10 2012BEAH01 Ciarán Beahan Ireland m 2377 11
11 2015BORR01 Leo Borromeo Philippines m 2377 11
12 2017XURU04 Ruihang Xu (许瑞航) China m 2386 13
13 2019HUNG16 Đỗ Quang Hưng Vietnam m 2394 14
14 2010WANG68 Bill Wang Canada m 2397 15
15 2017SIAU02 Max Siauw USA m 2398 16
16 2015DELA05 Richard Delacoste Switzerland m 2440 17
17 2010WUIF01 Yi-Fan Wu (吳亦凡) Taiwan m 2487 18
18 2015FUSH01 Firstian Fushada (符逢城) Indonesia m 2509 19
19 2017GOLU01 Theo Goluboff Argentina m 2526 20

How about the top twenty female rankings by average in the Square-1 event?

In [66]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_top_20_sq1_avg = """
                    SELECT Persons.id, Persons.name, Persons.countryId, Persons.gender, RanksAverage.best, RanksAverage.worldRank
                    FROM RanksAverage
                    INNER JOIN Persons ON RanksAverage.personID=Persons.id
                    WHERE RanksAverage.eventID='sq1' 
                        AND Persons.subid=1
                        AND Persons.gender='f'
                    ORDER BY RanksAverage.best
                    LIMIT 20 ;
                """
    df_top_20_sq1_avg = pd.read_sql(qry_top_20_sq1_avg, con=connection)
    display(df_top_20_sq1_avg)
id name countryId gender best worldRank
0 2016CHEN53 Anshu Chennuru USA f 668 15
1 2013KATO01 Eva Kato USA f 770 52
2 2017MELL02 Laura Mellier Switzerland f 856 93
3 2014CHAN23 Sophie Chan USA f 911 132
4 2017PABI01 Magdalena Pabisz Poland f 916 140
5 2016SINN01 Eleanor Sinnott United Kingdom f 964 179
6 2013KLEI03 Livia Kleiner USA f 982 196
7 2015CALD02 Kymberlyn Calderon USA f 991 205
8 2015JOIN01 Kyra Joiner USA f 1055 267
9 2022THUY01 Lê Phương Thùy Vietnam f 1068 282
10 2011WIJA03 Angeline Wijaya (黃千儀) Indonesia f 1071 287
11 2021WARC01 Dominika Warchoł Poland f 1093 314
12 2015JINX01 Tina Xiangyan Jin (金香延) China f 1101 326
13 2018DILW02 Lacey Dilworth USA f 1115 343
14 2010HULL01 Katie Hull USA f 1121 349
15 2020SIBU02 Josephine Siburian Australia f 1127 359
16 2018GRAH05 Kate Grahame USA f 1149 380
17 2015TRAC01 Catherine Trace USA f 1183 422
18 2015ZHOU02 Lucinda Zhou USA f 1255 533
19 2022BERN01 Beatriz Hernandes Bernardes Brazil f 1316 654

How about ranking countries by number of WCA competitions that have occurred in that country in 2022?

In [91]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_2022_country_comps = """
                        SELECT countryId, COUNT(countryId) as number_of_comps
                        FROM Competitions
                        WHERE Competitions.year = 2022
                        GROUP BY countryId
                        ORDER BY number_of_comps DESC
                        LIMIT 20
                    """
    df_2022_country_comps = pd.read_sql(qry_2022_country_comps, con=connection)
    display(df_2022_country_comps)
countryId number_of_comps
0 USA 280
1 Australia 66
2 Poland 46
3 India 45
4 Canada 32
5 United Kingdom 32
6 Brazil 32
7 Spain 29
8 France 28
9 Colombia 24
10 Sweden 23
11 Denmark 22
12 Turkey 22
13 Italy 21
14 Bolivia 16
15 New Zealand 16
16 Indonesia 14
17 Peru 14
18 Germany 13
19 Norway 13

The Future of World Records

Now we pivot our focus to a different question: how will cubing world records evolve over time? In recent years, thanks to many advancements in cube hardware as well as new methods of solving, we have seen significant drops in world record times across every single event in the WCA. At the WCA's inception, world records were understandably lowered significantly every time they were broken, since there simply weren't that many people competing or interested in the activity. But nowadays, we see records being broken by only hundredths of a second, and records standing for years. Will the progress of cubing world records slow and plateau? Or will they keep being lowered dramatically as the number of competitors and interest in the activity grow?

Let's run a query to see the entire history of 3x3 single world records. Besides the result itself, we want to know when each world record was set, so we will join the Results table with the Competitions table (which has data about when each world record was set and at what competition). To query for just world records, we must include Results.regionalSingleRecord = 'WR' in the WHERE clause of our query.

In [6]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    qry_333_singleWRs = """
                        SELECT Results.personName, 
                            Results.best,
                            Results.competitionId, 
                            Competitions.year,
                            Competitions.month,
                            Competitions.day
                        FROM Results
                        INNER JOIN Competitions ON Results.competitionId = Competitions.id
                        WHERE
                            Results.regionalSingleRecord = 'WR' AND
                            Results.eventId = '333'
                        ORDER BY Results.best DESC
                    """
    df_333_singleWRs = pd.read_sql(qry_333_singleWRs, con=connection)
    display(df_333_singleWRs)
personName best competitionId year month day
0 Minh Thai 2295 WC1982 1982 6 5
1 Dan Knights 1671 WC2003 2003 8 23
2 Jess Bonde 1653 WC2003 2003 8 23
3 Shotaro Makisumi (牧角章太郎) 1507 CaltechWinter2004 2004 1 24
4 Shotaro Makisumi (牧角章太郎) 1476 CaltechWinter2004 2004 1 24
5 Shotaro Makisumi (牧角章太郎) 1393 CaltechSpring2004 2004 4 3
6 Shotaro Makisumi (牧角章太郎) 1211 CaltechSpring2004 2004 4 3
7 Jean Pons 1175 DutchOpen2005 2005 10 16
8 Leyan Lo 1113 CaltechWinter2006 2006 1 14
9 Toby Mao (毛台立) 1048 US2006 2006 8 4
10 Edouard Chambon 1036 BelgianOpen2007 2007 2 24
11 Thibaut Jacquinot 986 SpanishOpen2007 2007 5 5
12 Erik Akkersdijk 977 DutchOpen2007 2007 10 13
13 Ron van Bruchem 955 Netherlands2007 2007 11 24
14 Edouard Chambon 918 MurciaOpen2008 2008 2 23
15 Yu Nakajima (中島悠) 872 KashiwaOpen2008 2008 5 5
16 Yu Nakajima (中島悠) 872 KashiwaOpen2008 2008 5 5
17 Erik Akkersdijk 708 CzechOpen2008 2008 7 12
18 Feliks Zemdegs 703 MelbourneCubeDay2010 2010 11 13
19 Feliks Zemdegs 677 MelbourneCubeDay2010 2010 11 13
20 Feliks Zemdegs 665 KubarooOpen2011 2011 5 7
21 Feliks Zemdegs 665 MelbourneSummer2011 2011 1 29
22 Feliks Zemdegs 624 KubarooOpen2011 2011 5 7
23 Feliks Zemdegs 618 MelbourneWinterOpen2011 2011 6 25
24 Feliks Zemdegs 566 MelbourneWinterOpen2011 2011 6 25
25 Mats Valk 555 ZonhovenOpen2013 2013 3 2
26 Collin Burns 525 DoylestownSpring2015 2015 4 25
27 Lucas Etter 490 RiverHillFall2015 2015 11 21
28 Mats Valk 474 JawaTimurOpen2016 2016 11 5
29 Feliks Zemdegs 473 POPSOpen2016 2016 12 11
30 Patrick Ponce 469 RallyInTheValley2017 2017 9 2
31 SeungBeom Cho (조승범) 459 Chicago2017 2017 10 28
32 Feliks Zemdegs 459 HobartSummer2018 2018 1 27
33 Feliks Zemdegs 422 CubeforCambodia2018 2018 5 6
34 Yusheng Du (杜宇生) 347 WuhuOpen2018 2018 11 24

Looks good, but we have some cleaning to do. First, notice that the best column is measured in centiseconds. We want this column to be ever so slightly more human readable, so we will divide each value in this column by 100 to get the time in seconds.

Now we are doing some manipulations with Pandas dataframes. If you are unfamiliar with the basics of dataframes and the Python pandas library, here are some recommended readings: 10 minutes to pandas, Pandas Tutorial, Pandas API

In [7]:
df_333_singleWRs["best"] = df_333_singleWRs["best"] / 100   # centiseconds -> seconds
df_333_singleWRs.tail()
Out[7]:
personName best competitionId year month day
30 Patrick Ponce 4.69 RallyInTheValley2017 2017 9 2
31 SeungBeom Cho (조승범) 4.59 Chicago2017 2017 10 28
32 Feliks Zemdegs 4.59 HobartSummer2018 2018 1 27
33 Feliks Zemdegs 4.22 CubeforCambodia2018 2018 5 6
34 Yusheng Du (杜宇生) 3.47 WuhuOpen2018 2018 11 24

Our goal is to plot this information on a line plot where the x-axis is time and the y-axis is the result. In order to do so, we must have a column representing time that can be easily interpreted by a plotting library like seaborn. Let's combine the year, month, and day columns into a single column called date of datetime objects.

For more information on the Python datetime module, here are some recommended readings: Python docs datetime, PYnative

In [8]:
df_333_singleWRs["date"] = pd.to_datetime(df_333_singleWRs[["year", "month", "day"]])
df_333_singleWRs.tail()
Out[8]:
personName best competitionId year month day date
30 Patrick Ponce 4.69 RallyInTheValley2017 2017 9 2 2017-09-02
31 SeungBeom Cho (조승범) 4.59 Chicago2017 2017 10 28 2017-10-28
32 Feliks Zemdegs 4.59 HobartSummer2018 2018 1 27 2018-01-27
33 Feliks Zemdegs 4.22 CubeforCambodia2018 2018 5 6 2018-05-06
34 Yusheng Du (杜宇生) 3.47 WuhuOpen2018 2018 11 24 2018-11-24

If we inspect carefully, there is a slight issue with the way that our dataframe is sorted.

In [9]:
df_333_singleWRs[20:23]
Out[9]:
personName best competitionId year month day date
20 Feliks Zemdegs 6.65 KubarooOpen2011 2011 5 7 2011-05-07
21 Feliks Zemdegs 6.65 MelbourneSummer2011 2011 1 29 2011-01-29
22 Feliks Zemdegs 6.24 KubarooOpen2011 2011 5 7 2011-05-07

Our query ordered the dataframe by world record time (the best column), but sometimes the world record gets tied, and the WCA still counts a tied world record as a world record in the database. As we see above, the time of 6.65 seconds set on Jan 29 occurred before the tied world record of 6.65 seconds on May 7, but it appears later in the dataframe. This could cause problems with our last step of data cleaning, so we fix it now: we simply sort our dataframe by date first, then by best (descending) when the date is the same.

In [10]:
df_333_singleWRs = df_333_singleWRs.sort_values(by=["date", "best"], ascending=[True, False])
df_333_singleWRs[20:23]
Out[10]:
personName best competitionId year month day date
21 Feliks Zemdegs 6.65 MelbourneSummer2011 2011 1 29 2011-01-29
20 Feliks Zemdegs 6.65 KubarooOpen2011 2011 5 7 2011-05-07
22 Feliks Zemdegs 6.24 KubarooOpen2011 2011 5 7 2011-05-07

Before we can plot, we have one more issue. Notice that near the top of our dataframe, multiple records share the same date. We cannot plot this correctly on a line plot, so we need some way of differentiating (by time) world records that happened on the same date.

Note that this issue only occurs with older world records, because the WCA has since changed their policy so that if a world record is broken multiple times on the same day, only the latest one is considered a world record.

In [11]:
df_333_singleWRs.head()
Out[11]:
personName best competitionId year month day date
0 Minh Thai 22.95 WC1982 1982 6 5 1982-06-05
1 Dan Knights 16.71 WC2003 2003 8 23 2003-08-23
2 Jess Bonde 16.53 WC2003 2003 8 23 2003-08-23
3 Shotaro Makisumi (牧角章太郎) 15.07 CaltechWinter2004 2004 1 24 2004-01-24
4 Shotaro Makisumi (牧角章太郎) 14.76 CaltechWinter2004 2004 1 24 2004-01-24

In order to differentiate world records that occurred on the same date, we can add an hour column to each record. Since the dataframe is now sorted chronologically (with same-date ties ordered from slower to faster), we know that a record earlier in the dataframe occurred before any record after it. For dates on which only one world record was set, we assign an hour of 0. For dates on which multiple world records were set, we separate them by an arbitrary time offset ("arbitrary" because the actual time of day does not matter, only the fact that slower world records occurred before faster ones). Careful inspection of our dataframe reveals that at most two world records were ever set on the same date, so when two share a date, we say the slower one occurred at hour 0 and the faster one at hour 12.

In [12]:
hour = []                                       # new column to add to df
prev_row = df_333_singleWRs.iloc[0]             # previous row (used when iterating)

# iterate through each row; note that even after sorting, the chronologically
# first record (Minh Thai, 1982) still has label 0, so `i == 0` works as the base case
for i, row in df_333_singleWRs.iterrows():
    if i == 0:                                  # base case: first world record
        hour.append(0)
    else:
        if prev_row["date"] == row["date"]:     # same date as the previous record
            hour.append(hour[-1] + 12)          # offset by 12 hours
        else:
            hour.append(0)
        prev_row = row

# add the new column
df_333_singleWRs["hour"] = hour
df_333_singleWRs
Out[12]:
personName best competitionId year month day date hour
0 Minh Thai 22.95 WC1982 1982 6 5 1982-06-05 0
1 Dan Knights 16.71 WC2003 2003 8 23 2003-08-23 0
2 Jess Bonde 16.53 WC2003 2003 8 23 2003-08-23 12
3 Shotaro Makisumi (牧角章太郎) 15.07 CaltechWinter2004 2004 1 24 2004-01-24 0
4 Shotaro Makisumi (牧角章太郎) 14.76 CaltechWinter2004 2004 1 24 2004-01-24 12
5 Shotaro Makisumi (牧角章太郎) 13.93 CaltechSpring2004 2004 4 3 2004-04-03 0
6 Shotaro Makisumi (牧角章太郎) 12.11 CaltechSpring2004 2004 4 3 2004-04-03 12
7 Jean Pons 11.75 DutchOpen2005 2005 10 16 2005-10-16 0
8 Leyan Lo 11.13 CaltechWinter2006 2006 1 14 2006-01-14 0
9 Toby Mao (毛台立) 10.48 US2006 2006 8 4 2006-08-04 0
10 Edouard Chambon 10.36 BelgianOpen2007 2007 2 24 2007-02-24 0
11 Thibaut Jacquinot 9.86 SpanishOpen2007 2007 5 5 2007-05-05 0
12 Erik Akkersdijk 9.77 DutchOpen2007 2007 10 13 2007-10-13 0
13 Ron van Bruchem 9.55 Netherlands2007 2007 11 24 2007-11-24 0
14 Edouard Chambon 9.18 MurciaOpen2008 2008 2 23 2008-02-23 0
15 Yu Nakajima (中島悠) 8.72 KashiwaOpen2008 2008 5 5 2008-05-05 0
16 Yu Nakajima (中島悠) 8.72 KashiwaOpen2008 2008 5 5 2008-05-05 12
17 Erik Akkersdijk 7.08 CzechOpen2008 2008 7 12 2008-07-12 0
18 Feliks Zemdegs 7.03 MelbourneCubeDay2010 2010 11 13 2010-11-13 0
19 Feliks Zemdegs 6.77 MelbourneCubeDay2010 2010 11 13 2010-11-13 12
21 Feliks Zemdegs 6.65 MelbourneSummer2011 2011 1 29 2011-01-29 0
20 Feliks Zemdegs 6.65 KubarooOpen2011 2011 5 7 2011-05-07 0
22 Feliks Zemdegs 6.24 KubarooOpen2011 2011 5 7 2011-05-07 12
23 Feliks Zemdegs 6.18 MelbourneWinterOpen2011 2011 6 25 2011-06-25 0
24 Feliks Zemdegs 5.66 MelbourneWinterOpen2011 2011 6 25 2011-06-25 12
25 Mats Valk 5.55 ZonhovenOpen2013 2013 3 2 2013-03-02 0
26 Collin Burns 5.25 DoylestownSpring2015 2015 4 25 2015-04-25 0
27 Lucas Etter 4.90 RiverHillFall2015 2015 11 21 2015-11-21 0
28 Mats Valk 4.74 JawaTimurOpen2016 2016 11 5 2016-11-05 0
29 Feliks Zemdegs 4.73 POPSOpen2016 2016 12 11 2016-12-11 0
30 Patrick Ponce 4.69 RallyInTheValley2017 2017 9 2 2017-09-02 0
31 SeungBeom Cho (조승범) 4.59 Chicago2017 2017 10 28 2017-10-28 0
32 Feliks Zemdegs 4.59 HobartSummer2018 2018 1 27 2018-01-27 0
33 Feliks Zemdegs 4.22 CubeforCambodia2018 2018 5 6 2018-05-06 0
34 Yusheng Du (杜宇生) 3.47 WuhuOpen2018 2018 11 24 2018-11-24 0

Now we can construct the full column of datetime objects.

In [13]:
df_333_singleWRs["date"] = pd.to_datetime(df_333_singleWRs[["year", "month", "day", "hour"]])
df_333_singleWRs
Out[13]:
personName best competitionId year month day date hour
0 Minh Thai 22.95 WC1982 1982 6 5 1982-06-05 00:00:00 0
1 Dan Knights 16.71 WC2003 2003 8 23 2003-08-23 00:00:00 0
2 Jess Bonde 16.53 WC2003 2003 8 23 2003-08-23 12:00:00 12
3 Shotaro Makisumi (牧角章太郎) 15.07 CaltechWinter2004 2004 1 24 2004-01-24 00:00:00 0
4 Shotaro Makisumi (牧角章太郎) 14.76 CaltechWinter2004 2004 1 24 2004-01-24 12:00:00 12
5 Shotaro Makisumi (牧角章太郎) 13.93 CaltechSpring2004 2004 4 3 2004-04-03 00:00:00 0
6 Shotaro Makisumi (牧角章太郎) 12.11 CaltechSpring2004 2004 4 3 2004-04-03 12:00:00 12
7 Jean Pons 11.75 DutchOpen2005 2005 10 16 2005-10-16 00:00:00 0
8 Leyan Lo 11.13 CaltechWinter2006 2006 1 14 2006-01-14 00:00:00 0
9 Toby Mao (毛台立) 10.48 US2006 2006 8 4 2006-08-04 00:00:00 0
10 Edouard Chambon 10.36 BelgianOpen2007 2007 2 24 2007-02-24 00:00:00 0
11 Thibaut Jacquinot 9.86 SpanishOpen2007 2007 5 5 2007-05-05 00:00:00 0
12 Erik Akkersdijk 9.77 DutchOpen2007 2007 10 13 2007-10-13 00:00:00 0
13 Ron van Bruchem 9.55 Netherlands2007 2007 11 24 2007-11-24 00:00:00 0
14 Edouard Chambon 9.18 MurciaOpen2008 2008 2 23 2008-02-23 00:00:00 0
15 Yu Nakajima (中島悠) 8.72 KashiwaOpen2008 2008 5 5 2008-05-05 00:00:00 0
16 Yu Nakajima (中島悠) 8.72 KashiwaOpen2008 2008 5 5 2008-05-05 12:00:00 12
17 Erik Akkersdijk 7.08 CzechOpen2008 2008 7 12 2008-07-12 00:00:00 0
18 Feliks Zemdegs 7.03 MelbourneCubeDay2010 2010 11 13 2010-11-13 00:00:00 0
19 Feliks Zemdegs 6.77 MelbourneCubeDay2010 2010 11 13 2010-11-13 12:00:00 12
21 Feliks Zemdegs 6.65 MelbourneSummer2011 2011 1 29 2011-01-29 00:00:00 0
20 Feliks Zemdegs 6.65 KubarooOpen2011 2011 5 7 2011-05-07 00:00:00 0
22 Feliks Zemdegs 6.24 KubarooOpen2011 2011 5 7 2011-05-07 12:00:00 12
23 Feliks Zemdegs 6.18 MelbourneWinterOpen2011 2011 6 25 2011-06-25 00:00:00 0
24 Feliks Zemdegs 5.66 MelbourneWinterOpen2011 2011 6 25 2011-06-25 12:00:00 12
25 Mats Valk 5.55 ZonhovenOpen2013 2013 3 2 2013-03-02 00:00:00 0
26 Collin Burns 5.25 DoylestownSpring2015 2015 4 25 2015-04-25 00:00:00 0
27 Lucas Etter 4.90 RiverHillFall2015 2015 11 21 2015-11-21 00:00:00 0
28 Mats Valk 4.74 JawaTimurOpen2016 2016 11 5 2016-11-05 00:00:00 0
29 Feliks Zemdegs 4.73 POPSOpen2016 2016 12 11 2016-12-11 00:00:00 0
30 Patrick Ponce 4.69 RallyInTheValley2017 2017 9 2 2017-09-02 00:00:00 0
31 SeungBeom Cho (조승범) 4.59 Chicago2017 2017 10 28 2017-10-28 00:00:00 0
32 Feliks Zemdegs 4.59 HobartSummer2018 2018 1 27 2018-01-27 00:00:00 0
33 Feliks Zemdegs 4.22 CubeforCambodia2018 2018 5 6 2018-05-06 00:00:00 0
34 Yusheng Du (杜宇生) 3.47 WuhuOpen2018 2018 11 24 2018-11-24 00:00:00 0

Now we can finally plot the data!

Note we will primarily be using the seaborn library to plot our data. This library is quite extensive; for more information, here are some recommended readings: Seaborn introduction, Seaborn API overview

In [13]:
sns.lineplot(data=df_333_singleWRs, y="best", x="date", marker="o")
Out[13]:
<Axes: xlabel='date', ylabel='best'>

We can add data point labels to see individual values, but it can get a little bit cluttered.

In [14]:
sns.lineplot(data=df_333_singleWRs, y="best", x="date", marker="o")

for date, best in zip(df_333_singleWRs["date"], df_333_singleWRs["best"]):
    label = f"({str(date.year)}, {best})"
    plt.annotate(label, (date, best), xytext=(3,3), textcoords="offset points", fontsize=6)

Let's try to fit a model to the data. Let's start off with a simple linear regression.

In [14]:
# converting datetime objects to ordinals, since LinearRegression cannot take datetime objects
df_333_singleWRs["ordinal"] = df_333_singleWRs["date"].apply(datetime.date.toordinal)

X = df_333_singleWRs[["ordinal"]]       # df of dates
y = df_333_singleWRs["best"]            # Series of results to match the dates
linreg = LinearRegression().fit(X, y)   # fit the linreg model

linreg_prediction = linreg.predict(X)                       # creating predicted y values using our linreg
df_333_singleWRs["predicted_linreg"] = linreg_prediction    # adding predicted values to our df

# plotting
sns.lineplot(data=df_333_singleWRs, y="best", x="date", marker="o")
sns.lineplot(data=df_333_singleWRs, x="date", y=linreg_prediction, color="#87bdff")
Out[14]:
<Axes: xlabel='date', ylabel='best'>

The line looks to fit the downward trend of the data, but there is a lot of room for improvement. To analyze how well our model fits, let's make a residual plot.

In [30]:
sns.residplot(data=df_333_singleWRs, x="ordinal", y="best")
Out[30]:
<Axes: xlabel='ordinal', ylabel='best'>

Note that an ordinal is an integer representation of a date: the proleptic Gregorian ordinal, where January 1 of year 1 has ordinal 1. Since we are only looking at residuals, the human-interpretability of the x-axis labels is not too important.
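As a quick, self-contained sanity check on how ordinals behave (the specific dates here are just illustrative):

```python
import datetime

# January 1 of year 1 has ordinal 1 in the proleptic Gregorian calendar
assert datetime.date(1, 1, 1).toordinal() == 1

# round-trip: a date converts to an ordinal and back without loss
d = datetime.date(2018, 11, 24)  # date of the current 3x3 single WR
assert datetime.date.fromordinal(d.toordinal()) == d

# consecutive days differ by exactly 1
assert datetime.date(2018, 11, 25).toordinal() == d.toordinal() + 1

print(d.toordinal())
```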

We can see that this linear regression does not fit well. Linear regression rests on several assumptions: linearity, homoscedasticity, normality of errors, and independence. Under these assumptions, a residual plot should show a random, even scatter around 0, with no trend in variability and roughly normally distributed residuals. Our residual plot instead shows a clear pattern: residuals sit well above 0 in the earliest years, dip below 0 in the middle years, then rise slightly above 0 in later years. The obvious visual explanation is that the data follows a curve rather than a straight line. Another contributing factor could simply be that we do not have many data points.
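One rough way to quantify the "random scatter" idea is to count how often adjacent residuals change sign: independent residuals flip sign about half the time, while the long runs we see in our plot flip far less often. A minimal sketch on synthetic residuals (the helper name and data are illustrative, not from our dataframe):

```python
import numpy as np

def sign_change_fraction(residuals):
    """Fraction of adjacent residual pairs that change sign.
    Near 0.5 for independent residuals; much lower when the
    residuals form systematic runs like those in our plot."""
    signs = np.sign(residuals)
    changes = np.sum(signs[:-1] != signs[1:])
    return changes / (len(residuals) - 1)

rng = np.random.default_rng(0)
random_resid = rng.normal(size=200)                 # behaves like a good fit
curved_resid = np.linspace(-1, 1, 200) ** 2 - 1/3   # systematic curve, like ours

print(sign_change_fraction(random_resid))  # near 0.5
print(sign_change_fraction(curved_resid))  # near 0 (long runs of one sign)
```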

To quantify the accuracy of our model, let's run an Ordinary Least Squares (OLS) regression using the statsmodels library.

In [112]:
ols_model = sm.ols(formula="best ~ ordinal", data=df_333_singleWRs).fit()
print(ols_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   best   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     227.1
Date:                Sat, 13 May 2023   Prob (F-statistic):           2.36e-16
Time:                        23:28:11   Log-Likelihood:                -65.260
No. Observations:                  35   AIC:                             134.5
Df Residuals:                      33   BIC:                             137.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   1239.0797     81.641     15.177      0.000    1072.980    1405.179
ordinal       -0.0017      0.000    -15.069      0.000      -0.002      -0.001
==============================================================================
Omnibus:                        7.722   Durbin-Watson:                   0.695
Prob(Omnibus):                  0.021   Jarque-Bera (JB):                6.267
Skew:                           0.945   Prob(JB):                       0.0436
Kurtosis:                       3.854   Cond. No.                     2.20e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.2e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

Notice that the p-value associated with the time (ordinal) coefficient is less than 0.05 (P>|t| = 0.000), which indicates that the relationship between time and world record single is statistically significant at the 95% confidence level. This is of course trivially true because over time the world record can only get lower and lower. Later on, we will explore a hypothesis that is not so obviously true.
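The p-values can also be pulled out of the fitted results object programmatically rather than read off the summary table. A minimal sketch on synthetic data (the `demo` frame and true slope are made up for illustration; the alias matches the one this tutorial uses for statsmodels' formula API):

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

# synthetic data with a known downward slope plus a little noise
rng = np.random.default_rng(42)
demo = pd.DataFrame({"x": np.arange(50.0)})
demo["y"] = 3.0 - 0.05 * demo["x"] + rng.normal(scale=0.1, size=50)

fit = sm.ols(formula="y ~ x", data=demo).fit()

# pvalues and params are Series indexed by term name, so significance
# can be checked in code instead of by reading the summary table
print(fit.pvalues["x"] < 0.05)   # True: the slope is significant
print(fit.params["x"])           # close to the true slope of -0.05
```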

Here we see that the R-squared value for our OLS model is 0.873 and our log-likelihood value is -65.260.

Let's try a different model to see if we can get a better fit.

In [169]:
sns.regplot(data=df_333_singleWRs, x="ordinal", y="best", order=3, ci=None)

# changing x-axis tick labels from ordinals to year 
ax = plt.gca()              # get current ax
xticks = ax.get_xticks()    # get xticks
ax.set_xticklabels([datetime.date.fromordinal(int(x)).year for x in xticks])    # convert each ordinal to a year
ax.set_xlabel("year")       # changing x-axis label
Out[169]:
Text(0.5, 0, 'year')

Above we are using seaborn's regplot with order=3, which fits a polynomial regression of degree 3 (still a linear model, but in the polynomial terms). Visually, this appears to fit our data much better. Let's see how the residual plot looks.

In [50]:
sns.residplot(data=df_333_singleWRs, x="ordinal", y="best", order=3)
Out[50]:
<Axes: xlabel='ordinal', ylabel='best'>

Much better! Here we see residuals seemingly randomly scattered, and with more residuals clustered around 0. This indicates a pretty good fit for our linear regression with polynomial terms model!
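Under the hood, a fit like regplot's can be reproduced with numpy's polyfit, and we can hand-compute R² to compare orders. A sketch on synthetic stand-in data (`xs`/`ys` play the role of our ordinal and best columns; note that fitting high-order polynomials directly on raw ordinals near 730,000 is numerically ill-conditioned, which is exactly what the condition-number warnings below are about):

```python
import numpy as np

def poly_r_squared(xs, ys, order):
    """R-squared of a least-squares polynomial fit of the given order."""
    coeffs = np.polyfit(xs, ys, order)
    predicted = np.polyval(coeffs, xs)
    ss_res = np.sum((ys - predicted) ** 2)
    ss_tot = np.sum((ys - np.mean(ys)) ** 2)
    return 1 - ss_res / ss_tot

# stand-in data with a decaying curve, loosely like the WR progression
xs = np.linspace(0, 10, 40)
ys = 20 * np.exp(-0.4 * xs) + 0.5

print(poly_r_squared(xs, ys, 1))  # straight line: lower R-squared
print(poly_r_squared(xs, ys, 3))  # cubic: higher R-squared
```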

If we look back at our polynomial with order 3, we can see how the line curves upwards at the end despite decreasing y-values. How about we change it to order 4?

In [59]:
sns.regplot(data=df_333_singleWRs, x="ordinal", y="best", order=4, ci=None)

# changing x-axis tick labels from ordinals to year 
ax = plt.gca()              # get current ax
xticks = ax.get_xticks()    # get xticks
ax.set_xticklabels([datetime.date.fromordinal(int(x)).year for x in xticks])    # convert each ordinal to a year
ax.set_xlabel("year")       # changing x-axis label
Out[59]:
Text(0.5, 0, 'year')
In [58]:
sns.residplot(data=df_333_singleWRs, x="ordinal", y="best", order=4)
Out[58]:
<Axes: xlabel='ordinal', ylabel='best'>

The residual plot looks relatively similar to the one with order=3. Let's run OLS models for these polynomials.

In [121]:
print("Polynomial of order 3")
ols_model_poly3 = sm.ols(formula="best ~ ordinal + I(ordinal**2) + I(ordinal**3)", data=df_333_singleWRs).fit()
print(ols_model_poly3.summary())
print("\n\n")

print("Polynomial of order 4")
ols_model_poly4 = sm.ols(formula="best ~ ordinal + I(ordinal**2) + I(ordinal**3) + I(ordinal**4)", data=df_333_singleWRs).fit()
print(ols_model_poly4.summary())
Polynomial of order 3
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   best   R-squared:                       0.875
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     231.1
Date:                Sat, 13 May 2023   Prob (F-statistic):           1.83e-16
Time:                        23:37:20   Log-Likelihood:                -64.991
No. Observations:                  35   AIC:                             134.0
Df Residuals:                      33   BIC:                             137.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        1.324e-20   8.53e-22     15.528      0.000    1.15e-20     1.5e-20
ordinal          6.456e-15   4.16e-16     15.528      0.000    5.61e-15     7.3e-15
I(ordinal ** 2)  2.362e-09   1.52e-10     15.528      0.000    2.05e-09    2.67e-09
I(ordinal ** 3) -3.198e-15   2.07e-16    -15.420      0.000   -3.62e-15   -2.78e-15
==============================================================================
Omnibus:                        7.381   Durbin-Watson:                   0.663
Prob(Omnibus):                  0.025   Jarque-Bera (JB):                5.938
Skew:                           0.936   Prob(JB):                       0.0514
Kurtosis:                       3.755   Cond. No.                     5.86e+24
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.59e-13. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.



Polynomial of order 4
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   best   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.892
Method:                 Least Squares   F-statistic:                     140.8
Date:                Sat, 13 May 2023   Prob (F-statistic):           1.38e-16
Time:                        23:37:20   Log-Likelihood:                -61.447
No. Observations:                  35   AIC:                             128.9
Df Residuals:                      32   BIC:                             133.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        1.226e-20   8.64e-22     14.187      0.000    1.05e-20     1.4e-20
ordinal          5.999e-15   4.23e-16     14.187      0.000    5.14e-15    6.86e-15
I(ordinal ** 2)  2.187e-09   1.54e-10     14.187      0.000    1.87e-09     2.5e-09
I(ordinal ** 3) -2.959e-15    2.1e-16    -14.081      0.000   -3.39e-15   -2.53e-15
I(ordinal ** 4)  1.152e-19    4.3e-20      2.680      0.012    2.76e-20    2.03e-19
==============================================================================
Omnibus:                        3.361   Durbin-Watson:                   1.072
Prob(Omnibus):                  0.186   Jarque-Bera (JB):                2.044
Skew:                           0.485   Prob(JB):                        0.360
Kurtosis:                       3.679   Cond. No.                     1.08e+23
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.08e+23. This might indicate that there are
strong multicollinearity or other numerical problems.

We have R-squared values of 0.875 and 0.898 for the OLS models with polynomials of order 3 and 4 respectively, both marginally higher than the 0.873 of our original linear OLS model. Additionally, we have log-likelihood values of -64.991 and -61.447 for the order-3 and order-4 polynomials respectively, both better than our original value of -65.260.

Improvements in World Records¶

Above we stated that rejecting the null hypothesis was trivial when asking whether time has a statistically significant effect on the world record 3x3 single (the world record can only get faster with time). But what about the influence of time on world record improvement? That is, if we define the reduction factor of a world record x as the time of x divided by the time of the world record immediately preceding x, how does the reduction factor change with time?

Answering this question can help give us more information on the trends of cubing world records, and subsequently cubing as a whole. Are new advancements in methods and hardware making a significant difference in the activity or have we reached the point where world records are starting to plateau, and improvement becomes harder and harder?

Let's create a new dataframe based on our previous one. This time, we will include the reduction factor for each world record 3x3 single (remember our definition of reduction factor from above). Any reduction factor will lie on the interval (0,1], with 1 being a tied world record, and values will get closer to 0 the more drastic the drop from the previous world record is. We will also have to ignore the first row from our new dataframe, since there is no reduction factor for the first ever world record (there is no previous world record to compare it to).

In [84]:
df_333_singleWRs_reduction = df_333_singleWRs.copy()        # creating a copy of our original df
reduction = []                                              # list of reduction factors 
prev_row = df_333_singleWRs.iloc[0]                         # prev row of our df (used when iterating)

# iterating through df
for i, row in df_333_singleWRs_reduction.iterrows():
    if i == 0: 
        reduction.append(0.0)
    else:
        reduction.append(row["best"]/prev_row["best"])      # reduction factor = current WR / previous WR
    prev_row = row

# adding new column
df_333_singleWRs_reduction["reduction"] = reduction     
# rearranging columns and dropping ones we don't need
df_333_singleWRs_reduction = df_333_singleWRs_reduction[["personName", "best", "reduction", "competitionId", "date", "ordinal"]]
# dropping the first row since it's not relevant (N/A reduction factor since it's the first WR)
df_333_singleWRs_reduction = df_333_singleWRs_reduction.drop(index=0)
# reindex the df
df_333_singleWRs_reduction.reset_index(drop=True, inplace=True)
df_333_singleWRs_reduction
Out[84]:
personName best reduction competitionId date ordinal
0 Dan Knights 16.71 0.728105 WC2003 2003-08-23 00:00:00 731450
1 Jess Bonde 16.53 0.989228 WC2003 2003-08-23 12:00:00 731450
2 Shotaro Makisumi (牧角章太郎) 15.07 0.911676 CaltechWinter2004 2004-01-24 00:00:00 731604
3 Shotaro Makisumi (牧角章太郎) 14.76 0.979429 CaltechWinter2004 2004-01-24 12:00:00 731604
4 Shotaro Makisumi (牧角章太郎) 13.93 0.943767 CaltechSpring2004 2004-04-03 00:00:00 731674
5 Shotaro Makisumi (牧角章太郎) 12.11 0.869347 CaltechSpring2004 2004-04-03 12:00:00 731674
6 Jean Pons 11.75 0.970273 DutchOpen2005 2005-10-16 00:00:00 732235
7 Leyan Lo 11.13 0.947234 CaltechWinter2006 2006-01-14 00:00:00 732325
8 Toby Mao (毛台立) 10.48 0.941599 US2006 2006-08-04 00:00:00 732527
9 Edouard Chambon 10.36 0.988550 BelgianOpen2007 2007-02-24 00:00:00 732731
10 Thibaut Jacquinot 9.86 0.951737 SpanishOpen2007 2007-05-05 00:00:00 732801
11 Erik Akkersdijk 9.77 0.990872 DutchOpen2007 2007-10-13 00:00:00 732962
12 Ron van Bruchem 9.55 0.977482 Netherlands2007 2007-11-24 00:00:00 733004
13 Edouard Chambon 9.18 0.961257 MurciaOpen2008 2008-02-23 00:00:00 733095
14 Yu Nakajima (中島悠) 8.72 0.949891 KashiwaOpen2008 2008-05-05 00:00:00 733167
15 Yu Nakajima (中島悠) 8.72 1.000000 KashiwaOpen2008 2008-05-05 12:00:00 733167
16 Erik Akkersdijk 7.08 0.811927 CzechOpen2008 2008-07-12 00:00:00 733235
17 Feliks Zemdegs 7.03 0.992938 MelbourneCubeDay2010 2010-11-13 00:00:00 734089
18 Feliks Zemdegs 6.77 0.963016 MelbourneCubeDay2010 2010-11-13 12:00:00 734089
19 Feliks Zemdegs 6.65 0.982275 MelbourneSummer2011 2011-01-29 00:00:00 734166
20 Feliks Zemdegs 6.65 1.000000 KubarooOpen2011 2011-05-07 00:00:00 734264
21 Feliks Zemdegs 6.24 0.938346 KubarooOpen2011 2011-05-07 12:00:00 734264
22 Feliks Zemdegs 6.18 0.990385 MelbourneWinterOpen2011 2011-06-25 00:00:00 734313
23 Feliks Zemdegs 5.66 0.915858 MelbourneWinterOpen2011 2011-06-25 12:00:00 734313
24 Mats Valk 5.55 0.980565 ZonhovenOpen2013 2013-03-02 00:00:00 734929
25 Collin Burns 5.25 0.945946 DoylestownSpring2015 2015-04-25 00:00:00 735713
26 Lucas Etter 4.90 0.933333 RiverHillFall2015 2015-11-21 00:00:00 735923
27 Mats Valk 4.74 0.967347 JawaTimurOpen2016 2016-11-05 00:00:00 736273
28 Feliks Zemdegs 4.73 0.997890 POPSOpen2016 2016-12-11 00:00:00 736309
29 Patrick Ponce 4.69 0.991543 RallyInTheValley2017 2017-09-02 00:00:00 736574
30 SeungBeom Cho (조승범) 4.59 0.978678 Chicago2017 2017-10-28 00:00:00 736630
31 Feliks Zemdegs 4.59 1.000000 HobartSummer2018 2018-01-27 00:00:00 736721
32 Feliks Zemdegs 4.22 0.919390 CubeforCambodia2018 2018-05-06 00:00:00 736820
33 Yusheng Du (杜宇生) 3.47 0.822275 WuhuOpen2018 2018-11-24 00:00:00 737022

Now let's plot our dataframe, and see how the reduction factor of 3x3 world record singles has changed over time.

In [171]:
sns.lineplot(data=df_333_singleWRs_reduction, x="date", y="reduction", marker="o")
Out[171]:
<Axes: xlabel='date', ylabel='reduction'>

From visual inspection, there is no strong and obvious correlation amongst the data. Reduction factors seem to be clustered in the 0.95-1 range, with a few in the 0.90-0.95 range, and a few outliers as well. Let's plot a linear regression line to fit the data.

Before we can run linear regressions on this dataframe, let's sort out the ordinal column. The seaborn library does not play nicely with regression calculations on Python datetime objects, so we converted each date to its ordinal (integer) representation. However, this reintroduces the problem of overlapping results when a record is broken twice on the same day. We therefore apply the same logic as before (where we added 12 hours to differentiate records set on the same date), but this time we add 0.5 to differentiate between any two ordinals that are the same.
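Since at most two records share a date, the same offset can also be computed in one vectorized step with groupby().cumcount(). A sketch on stand-in ordinals:

```python
import pandas as pd

# stand-in ordinals with two duplicated dates and one unique date
demo = pd.DataFrame({"ordinal": [731450, 731450, 731604, 731604, 731674]})

# cumcount() numbers the rows within each group of equal ordinals
# (0 for the first occurrence, 1 for the second, ...), so adding
# 0.5 * cumcount offsets duplicates exactly like the loop does
demo["ordinal"] = demo["ordinal"] + 0.5 * demo.groupby("ordinal").cumcount()

print(demo["ordinal"].tolist())
# [731450.0, 731450.5, 731604.0, 731604.5, 731674.0]
```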

In [85]:
ordinal_time = []                                       # new column to add to df
prev_row = df_333_singleWRs_reduction.iloc[0]           # prev row of our df (used when iterating)

# iterating through each row
for i, row in df_333_singleWRs_reduction.iterrows():
    if i == 0:                                          # base case
        ordinal_time.append(row["ordinal"])
    else:
        if prev_row["ordinal"] == row["ordinal"]:       # if ordinal is same as the previous row
            ordinal_time.append(row["ordinal"]+0.5)
        else:
            ordinal_time.append(row["ordinal"])
        prev_row = row

# add the new column
df_333_singleWRs_reduction["ordinal"] = ordinal_time
df_333_singleWRs_reduction
Out[85]:
personName best reduction competitionId date ordinal
0 Dan Knights 16.71 0.728105 WC2003 2003-08-23 00:00:00 731450.0
1 Jess Bonde 16.53 0.989228 WC2003 2003-08-23 12:00:00 731450.5
2 Shotaro Makisumi (牧角章太郎) 15.07 0.911676 CaltechWinter2004 2004-01-24 00:00:00 731604.0
3 Shotaro Makisumi (牧角章太郎) 14.76 0.979429 CaltechWinter2004 2004-01-24 12:00:00 731604.5
4 Shotaro Makisumi (牧角章太郎) 13.93 0.943767 CaltechSpring2004 2004-04-03 00:00:00 731674.0
5 Shotaro Makisumi (牧角章太郎) 12.11 0.869347 CaltechSpring2004 2004-04-03 12:00:00 731674.5
6 Jean Pons 11.75 0.970273 DutchOpen2005 2005-10-16 00:00:00 732235.0
7 Leyan Lo 11.13 0.947234 CaltechWinter2006 2006-01-14 00:00:00 732325.0
8 Toby Mao (毛台立) 10.48 0.941599 US2006 2006-08-04 00:00:00 732527.0
9 Edouard Chambon 10.36 0.988550 BelgianOpen2007 2007-02-24 00:00:00 732731.0
10 Thibaut Jacquinot 9.86 0.951737 SpanishOpen2007 2007-05-05 00:00:00 732801.0
11 Erik Akkersdijk 9.77 0.990872 DutchOpen2007 2007-10-13 00:00:00 732962.0
12 Ron van Bruchem 9.55 0.977482 Netherlands2007 2007-11-24 00:00:00 733004.0
13 Edouard Chambon 9.18 0.961257 MurciaOpen2008 2008-02-23 00:00:00 733095.0
14 Yu Nakajima (中島悠) 8.72 0.949891 KashiwaOpen2008 2008-05-05 00:00:00 733167.0
15 Yu Nakajima (中島悠) 8.72 1.000000 KashiwaOpen2008 2008-05-05 12:00:00 733167.5
16 Erik Akkersdijk 7.08 0.811927 CzechOpen2008 2008-07-12 00:00:00 733235.0
17 Feliks Zemdegs 7.03 0.992938 MelbourneCubeDay2010 2010-11-13 00:00:00 734089.0
18 Feliks Zemdegs 6.77 0.963016 MelbourneCubeDay2010 2010-11-13 12:00:00 734089.5
19 Feliks Zemdegs 6.65 0.982275 MelbourneSummer2011 2011-01-29 00:00:00 734166.0
20 Feliks Zemdegs 6.65 1.000000 KubarooOpen2011 2011-05-07 00:00:00 734264.0
21 Feliks Zemdegs 6.24 0.938346 KubarooOpen2011 2011-05-07 12:00:00 734264.5
22 Feliks Zemdegs 6.18 0.990385 MelbourneWinterOpen2011 2011-06-25 00:00:00 734313.0
23 Feliks Zemdegs 5.66 0.915858 MelbourneWinterOpen2011 2011-06-25 12:00:00 734313.5
24 Mats Valk 5.55 0.980565 ZonhovenOpen2013 2013-03-02 00:00:00 734929.0
25 Collin Burns 5.25 0.945946 DoylestownSpring2015 2015-04-25 00:00:00 735713.0
26 Lucas Etter 4.90 0.933333 RiverHillFall2015 2015-11-21 00:00:00 735923.0
27 Mats Valk 4.74 0.967347 JawaTimurOpen2016 2016-11-05 00:00:00 736273.0
28 Feliks Zemdegs 4.73 0.997890 POPSOpen2016 2016-12-11 00:00:00 736309.0
29 Patrick Ponce 4.69 0.991543 RallyInTheValley2017 2017-09-02 00:00:00 736574.0
30 SeungBeom Cho (조승범) 4.59 0.978678 Chicago2017 2017-10-28 00:00:00 736630.0
31 Feliks Zemdegs 4.59 1.000000 HobartSummer2018 2018-01-27 00:00:00 736721.0
32 Feliks Zemdegs 4.22 0.919390 CubeforCambodia2018 2018-05-06 00:00:00 736820.0
33 Yusheng Du (杜宇生) 3.47 0.822275 WuhuOpen2018 2018-11-24 00:00:00 737022.0

Now we can run a linear regression on our data.

In [172]:
sns.regplot(data=df_333_singleWRs_reduction, x="ordinal", y="reduction", ci=None)

# changing x-axis tick labels from ordinals to year 
ax = plt.gca()              # get current ax
xticks = ax.get_xticks()    # get xticks
ax.set_xticklabels([datetime.date.fromordinal(int(x)).year for x in xticks])    # convert each ordinal to a year
ax.set_xlabel("year")       # changing x-axis label
Out[172]:
Text(0.5, 0, 'year')
In [102]:
sns.regplot(data=df_333_singleWRs_reduction, x="ordinal", y="reduction", order=4, ci=None)

# changing x-axis tick labels from ordinals to year 
ax = plt.gca()              # get current ax
xticks = ax.get_xticks()    # get xticks
ax.set_xticklabels([datetime.date.fromordinal(int(x)).year for x in xticks])    # convert each ordinal to a year
ax.set_xlabel("year")       # changing x-axis label
Out[102]:
Text(0.5, 0, 'year')

Once again, the line does not seem to fit the data too well. Without any obvious correlation in the data, we would be hard pressed to find a visually good fitting model. Let's run an OLS regression like before.

In [111]:
reduction_ols_model = sm.ols(formula="reduction ~ ordinal", data=df_333_singleWRs_reduction).fit()
print(reduction_ols_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              reduction   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.9353
Date:                Sat, 13 May 2023   Prob (F-statistic):              0.341
Time:                        23:27:56   Log-Likelihood:                 48.207
No. Observations:                  34   AIC:                            -92.41
Df Residuals:                      32   BIC:                            -89.36
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.1887      4.278     -0.745      0.461     -11.902       5.524
ordinal     5.637e-06   5.83e-06      0.967      0.341   -6.24e-06    1.75e-05
==============================================================================
Omnibus:                       24.687   Durbin-Watson:                   1.784
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               38.589
Skew:                          -1.937   Prob(JB):                     4.17e-09
Kurtosis:                       6.498   Cond. No.                     3.03e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.03e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

Notice that the p-value associated with the time (ordinal) coefficient is greater than 0.05 (P>|t| = 0.341), which indicates that the relationship between time and reduction factor is not statistically significant at the 95% confidence level. Thus, we fail to reject the null hypothesis that there is no relationship between time and reduction factor. The R-squared value for this OLS model is also just 0.028, far lower than the 0.873 we saw for the OLS model of 3x3 single world records.

Predicting the Future¶

In [200]:
with connect(host="localhost", user=mysql_username, password=mysql_password, database="wca") as connection:
    tymon_id = '2016KOLA02'
    # Pull every 3x3 average Tymon has recorded, oldest competition first.
    # (average > 0 filters out DNF (-1), DNS (-2), and empty (0) results.)
    qry_tymon_333avgs = f"""
                    SELECT Results.competitionId, 
                        Results.roundTypeId, 
                        Results.average, 
                        Competitions.year, 
                        Competitions.month,
                        Competitions.day
                    FROM Results
                    INNER JOIN Persons ON Results.personId=Persons.id
                    INNER JOIN Competitions ON Results.competitionId=Competitions.id
                    WHERE Results.personId = '{tymon_id}'
                        AND Persons.subid = 1
                        AND Results.eventId = '333'
                        AND Results.average > 0
                    ORDER BY Competitions.year ASC, Competitions.month ASC, Competitions.day ASC;
                """
    df_tymon_333avgs = pd.read_sql(qry_tymon_333avgs, con=connection)
    display(df_tymon_333avgs)

    # with pd.option_context('display.max_rows', None,):
    #     display(df_tymon_333avgs)
competitionId roundTypeId average year month day
0 WLSWiosna2016 1 2752 2016 5 8
1 LodzCubingSummer2016 1 2437 2016 6 4
2 WLSLato2016 1 2133 2016 7 9
3 Euro2016 1 2049 2016 7 15
4 IIMasovianOpenPlock2016 2 1809 2016 8 27
... ... ... ... ... ... ...
318 PBsinPalisades2023 2 558 2023 4 15
319 PBsinPalisades2023 1 518 2023 4 15
320 CapeCod2023 f 582 2023 4 22
321 CapeCod2023 2 555 2023 4 22
322 CapeCod2023 1 560 2023 4 22

323 rows × 6 columns
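As a side note, the WCA database stores results in centiseconds, so the `average` values above are hundredths of a second (e.g. 2752 is a 27.52-second average). A tiny helper (hypothetical, included only for readability) makes the conversion explicit:

```python
def centiseconds_to_seconds(cs):
    """Convert a WCA result value (stored in centiseconds) to seconds."""
    return cs / 100

print(centiseconds_to_seconds(2752))  # the first average above: 27.52 seconds
```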

In [201]:
df_tymon_333avgs_groupby_comp = df_tymon_333avgs.groupby("competitionId")
modified_groups = []
for competition, group in df_tymon_333avgs_groupby_comp:
    # Rounds of the same competition share a date, so spread them across
    # artificial "hours" to keep their order when sorting chronologically.
    # Generate exactly len(group) hour values (range(0, 24, 24//len(group))
    # can overshoot when len(group) does not divide 24 evenly).
    step = 24 // len(group)
    hours = [i * step for i in range(len(group))]
    # rows within a competition come back latest-round first, so reverse
    hours.reverse()
    group["hour"] = hours
    modified_groups.append(group)

df_tymon_333avgs = pd.concat(modified_groups)
df_tymon_333avgs = df_tymon_333avgs.sort_values(by=["year", "month", "day", "hour"])
display(df_tymon_333avgs)
# with pd.option_context('display.max_rows', None,):
#     display(df_tymon_333avgs)
competitionId roundTypeId average year month day hour
0 WLSWiosna2016 1 2752 2016 5 8 0
1 LodzCubingSummer2016 1 2437 2016 6 4 0
2 WLSLato2016 1 2133 2016 7 9 0
3 Euro2016 1 2049 2016 7 15 0
5 IIMasovianOpenPlock2016 1 1662 2016 8 27 0
... ... ... ... ... ... ... ...
317 PBsinPalisades2023 3 560 2023 4 15 12
316 PBsinPalisades2023 f 527 2023 4 15 18
322 CapeCod2023 1 560 2023 4 22 0
321 CapeCod2023 2 555 2023 4 22 8
320 CapeCod2023 f 582 2023 4 22 16

323 rows × 7 columns

In [202]:
df_tymon_333avgs["date"] = pd.to_datetime(df_tymon_333avgs[["year", "month", "day", "hour"]])
df_tymon_333avgs
Out[202]:
competitionId roundTypeId average year month day hour date
0 WLSWiosna2016 1 2752 2016 5 8 0 2016-05-08 00:00:00
1 LodzCubingSummer2016 1 2437 2016 6 4 0 2016-06-04 00:00:00
2 WLSLato2016 1 2133 2016 7 9 0 2016-07-09 00:00:00
3 Euro2016 1 2049 2016 7 15 0 2016-07-15 00:00:00
5 IIMasovianOpenPlock2016 1 1662 2016 8 27 0 2016-08-27 00:00:00
... ... ... ... ... ... ... ... ...
317 PBsinPalisades2023 3 560 2023 4 15 12 2023-04-15 12:00:00
316 PBsinPalisades2023 f 527 2023 4 15 18 2023-04-15 18:00:00
322 CapeCod2023 1 560 2023 4 22 0 2023-04-22 00:00:00
321 CapeCod2023 2 555 2023 4 22 8 2023-04-22 08:00:00
320 CapeCod2023 f 582 2023 4 22 16 2023-04-22 16:00:00

323 rows × 8 columns
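The cell above relies on the fact that `pd.to_datetime` can assemble timestamps directly from a DataFrame of appropriately named component columns (`year`, `month`, `day`, `hour`). A minimal standalone illustration:

```python
import pandas as pd

# component columns with the names to_datetime recognizes
parts = pd.DataFrame({"year": [2016], "month": [5], "day": [8], "hour": [6]})
timestamps = pd.to_datetime(parts)
print(timestamps.iloc[0])  # 2016-05-08 06:00:00
```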

In [203]:
sns.lineplot(data=df_tymon_333avgs, x="date", y="average")
Out[203]:
<Axes: xlabel='date', ylabel='average'>
In [205]:
# data preprocessing: scale the averages into [0, 1] (helps LSTM training)
scaler = MinMaxScaler()
data = scaler.fit_transform(df_tymon_333avgs["average"].values.reshape(-1, 1))

# splitting data into training and validation sets
training_size = int(len(data) * 0.8)
training_data = data[:training_size, :]
validation_data = data[training_size:, :]

# print(training_data)
# print(validation_data)

# defining the RNN model architecture
n_steps = 3                             # number of time steps to consider for each input
n_features = 1                          # number of features in each input
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# training the RNN model (left commented out; create_dataset would build
# sliding-window inputs/targets from the scaled series)
# X_train, y_train = create_dataset(training_data, n_steps)
# X_val, y_val = create_dataset(validation_data, n_steps)
# model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))
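The commented-out training calls reference a `create_dataset` helper that is not defined in this cell. Here is a plausible sketch of such a sliding-window builder, assuming inputs shaped for the `(n_steps, n_features)` LSTM defined above (the name and signature match the commented code, but the implementation is our own):

```python
import numpy as np

def create_dataset(series, n_steps):
    """Turn a (T, 1) array into sliding-window samples: each input is
    n_steps consecutive values, and the target is the value after them."""
    X, y = [], []
    for i in range(len(series) - n_steps):
        X.append(series[i:i + n_steps, 0])
        y.append(series[i + n_steps, 0])
    # reshape to (samples, time steps, features) as Keras LSTMs expect
    return np.array(X).reshape(-1, n_steps, 1), np.array(y)
```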

Conclusion¶

We hope the reader has gained new insight into the world of competitive Rubik's Cube solving, learned more about the data science pipeline, and gained more experience with data analysis in Python. We have seen several different models for our questions regarding world record changes over time. Although machines can predict the answers to our questions with increasing accuracy, we are still constantly surprised by real-world results that defy machine-derived expectations (true for cubing and for the world in general). Cubing is still a relatively "young" activity, and the datasets available to answer many of our questions are still relatively small. It will be interesting to see how the rapid growth of cubing produces more and more data, and how that data subsequently shapes our models and predictions for the future.