Revision | a84ddd6fd86cdf58bcf3c6889837eeee6b45dc5b (tree) |
---|---|
Time | 2015-03-20 06:36:08 |
Author | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
Python code implementing a random forest classifier for a Kaggle competition.
@@ -0,0 +1,45 @@ | ||
1 | +#! /usr/bin/env python | |
2 | + | |
3 | +""" | |
4 | +Beating the benchmark | |
5 | +Otto Group product classification challenge @ Kaggle | |
6 | + | |
7 | +__author__ : Abhishek Thakur | |
8 | +""" | |
9 | + | |
10 | +import pandas as pd | |
11 | +import numpy as np | |
12 | +from sklearn import ensemble, feature_extraction, preprocessing | |
13 | + | |
14 | +# import data | |
15 | +train = pd.read_csv('train.csv') | |
16 | +test = pd.read_csv('test.csv') | |
17 | +sample = pd.read_csv('sampleSubmission.csv') | |
18 | + | |
19 | +# drop ids and get labels | |
20 | +labels = train.target.values | |
21 | +train = train.drop('id', axis=1) | |
22 | +train = train.drop('target', axis=1) | |
23 | +test = test.drop('id', axis=1) | |
24 | + | |
25 | +# transform counts to TFIDF features | |
26 | +tfidf = feature_extraction.text.TfidfTransformer() | |
27 | +train = tfidf.fit_transform(train).toarray() | |
28 | +test = tfidf.transform(test).toarray() | |
29 | + | |
30 | +# encode labels | |
31 | +lbl_enc = preprocessing.LabelEncoder() | |
32 | +labels = lbl_enc.fit_transform(labels) | |
33 | + | |
34 | +# train a random forest classifier | |
35 | +clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=500) | |
36 | +clf.fit(train, labels) | |
37 | + | |
38 | +# predict on test set | |
39 | +preds = clf.predict_proba(test) | |
40 | + | |
41 | +# create submission file | |
42 | +preds = pd.DataFrame(preds, index=sample.id.values, columns=sample.columns[1:]) | |
43 | +preds.to_csv('benchmark.csv', index_label='id') | |
44 | + | |
45 | +print "So far so good" |