In [1]:
import pandas as pd
In [2]:
# Read ../data/ign.csv (resolved against the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/ign.csv")
In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
df.info()
In [4]:
# Remove the columns that are not kept for the rest of the notebook.
# A single drop(columns=...) call makes one copy of the frame and is atomic:
# if any of the names is missing, a KeyError is raised and `df` is left untouched.
df = df.drop(columns=['title', 'url', 'Unnamed: 0'])
In [5]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")
In [6]:
# Show the column summary again after the preceding clean-up cells.
# DataFrame.info() prints by itself and returns None, so no print() wrapper is used
# (it would only add a stray "None" line to the output).
df.info()
In [7]:
# Plain-text preview of the first five rows.
print(df.head(n=5))
In [8]:
from sklearn import preprocessing
In [9]:
# Integer-encode every categorical (object-dtype) column of `df` in place.
#
# One LabelEncoder is fitted per column and stored in `label_encoders`
# (column name -> fitted encoder), so the codes of any column can later be
# mapped back with `label_encoders[col].inverse_transform(...)`. Re-fitting a
# single shared encoder would keep only the mapping of the last column.
label_encoders = {}
le = preprocessing.LabelEncoder()  # stays bound even when no column needs encoding
for col in df.columns:
    # Encode only the categorical variables
    if df[col].dtype == 'object':
        le = preprocessing.LabelEncoder()
        # fit_transform learns the classes and returns the integer codes in one pass.
        df[col] = le.fit_transform(df[col])
        print("Encoded classes are: {}\n".format(le.classes_))
        label_encoders[col] = le
In [10]:
# Plain-text preview of the first five rows, now with the categorical columns encoded.
print(df.head(n=5))
Keep in mind that although score_phrase would normally be the feature to predict (based on the genre of the game, the score — which may even be directly correlated with it — the release year, etc.), it might be more interesting to try using another feature as the label. Just use something that makes sense :)
If needed, feel free to apply the knowledge you have already gathered to make changes to the dataset.
The goal of this exercise is to:
It's a good idea to use a random_state equal to some integer in order to replicate results.
Remember, the goal is to get acquainted with this kind of procedure. Don't stress too much about high scores. If you think of anything else you would like to try, feel free to implement it!
In [11]:
# Now it's your turn
In [ ]: