import collections
import itertools

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split

def read_libsvm_format(url):
    """Load a space-separated LIBSVM-style file into an all-float DataFrame.

    Each line is expected to look like ``<label> <idx>:<value> <idx>:<value> ...``
    with single spaces between tokens. Only the text after the first ``:`` of
    each feature token is kept; the index part before it is ignored, so the
    features are taken positionally.

    Parameters
    ----------
    url : str, path or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One float column per feature position (keeping the integer column
        labels assigned by ``read_csv``), followed by a final ``'target'``
        column holding the label from the first field of each line.

    Notes
    -----
    Rows that still contain a missing value after the all-empty columns are
    removed are dropped entirely.
    """
    raw = pd.read_csv(url, sep=' ', header=None)
    # Drop columns that are empty on every row (e.g. one produced by a
    # trailing space at the end of each line).
    raw = raw.dropna(axis=1, how='all')
    # Drop every row that has any NaN value left.
    raw = raw.dropna()

    target = raw.iloc[:, 0].values

    # Strip the "<idx>:" prefix from every feature token. Done column by
    # column with Series.map because DataFrame.applymap is deprecated in
    # recent pandas releases; the element-wise behaviour is the same.
    features = raw.iloc[:, 1:].apply(
        lambda column: column.map(lambda token: token.split(':')[1])
    )
    features['target'] = target

    return features.astype(float)

def report(reg, model):
    """Print the best score and the best parameter set of a fitted search.

    ``reg`` must expose ``best_score_`` and ``best_params_``; ``model`` is the
    display name inserted into both printed messages. Returns ``None``.
    """
    print('Best R^2 score of {} models: {}'.format(model, reg.best_score_))
    print('Parameters of the best {} model: {}'.format(model, reg.best_params_))
    
def regression(X, y, regressor, grid_params, name):
    """Grid-search ``regressor`` over ``grid_params`` on ``(X, y)`` and print the outcome.

    The search runs with 5-fold cross-validation, ``n_jobs=-1`` and no
    progress output. Once fitted, the best score and parameters are printed
    through ``report`` under the label ``name``. Nothing is returned.
    """
    search = GridSearchCV(
        estimator=regressor,
        param_grid=grid_params,
        cv=5,
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X, y)
    report(search, name)

# Reading Dataset

# Other LIBSVM datasets (binary classification), kept for reference only.
# NOTE(review): the two commented-out calls below pass `diabetes`, which is not
# defined in the visible code, plus two class labels, while
# read_libsvm_format(url) accepts a single argument. They must be fixed
# before being re-enabled.
#breast = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/breast-cancer_scale' # class labels: 2, 4
#df = read_libsvm_format(diabetes, 2, 4)
#leukemia = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/leu.bz2' # class labels: -1, 1
#df = read_libsvm_format(diabetes, -1, 1)

# Dataset actually used for the regression experiments below.
body = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/bodyfat_scale'
df = read_libsvm_format(body)

# read_libsvm_format puts 'target' in the last column, so every column before
# it is a feature.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Model and Parameter Search

# Base estimator; kernel, C, epsilon and coef0 are supplied by the grid below.
svr = svm.SVR(gamma='scale')

# Two sub-grids: coef0 is searched only together with the poly and sigmoid
# kernels, while the linear and rbf kernels are searched without it.
svr_param_grid = [
    {
        'C': [1, 10, 100],
        'coef0': np.linspace(0, 1, 2),    # 0.0, 1.0
        'epsilon': np.linspace(0, 1, 3),  # 0.0, 0.5, 1.0
        'kernel': ['poly', 'sigmoid'],
    },
    {
        'C': [1, 10, 100],
        'epsilon': np.linspace(0, 1, 3),  # 0.0, 0.5, 1.0
        'kernel': ['linear', 'rbf'],
    },
]

regression(X=X, y=y, regressor=svr, grid_params=svr_param_grid, name='SVR')

# Hyper-parameter grid for ordinary least squares.
# `normalize` is deliberately not searched: LinearRegression deprecated that
# parameter in scikit-learn 1.0 and removed it in 1.2, where listing it in the
# grid makes the search fail with an invalid-parameter error. If feature
# scaling is wanted, apply a scaler to X before fitting instead.
lr_param_grid = [
    {'fit_intercept': [True, False], 'copy_X': [True, False]}
]

lr = LinearRegression()

# Same 5-fold grid search and report as for the SVR models above.
regression(X, y, lr, lr_param_grid, 'Linear Regression')

/home/hd/miniconda3/envs/svm/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)

Best R^2 score of SVR models: 0.976375461803825
Parameters of the best SVR model: {'C': 10, 'epsilon': 0.0, 'kernel': 'linear'}
Best R^2 score of Linear Regression models: 0.9727574884899195
Parameters of the best Linear Regression model: {'copy_X': True, 'fit_intercept': True, 'normalize': False}

/home/hd/miniconda3/envs/svm/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)