technical-knockout.com : Machine Learning

Machine Learning A-Z: Part 2 – Regression (Random Forest Regression)

Random Forest Intuition

Ensemble Learning

STEP 1: Pick at random K data points from the Training set.

STEP 2: Build the Decision Tree associated to these K data points.

STEP 3: Choose the number Ntree of trees you want to build and repeat STEPS 1 & 2.

STEP 4: For a new data point, make each one of your Ntree trees predict the value of Y for the data point in question, and assign the new data point the average across all of the predicted Y values.

e.g. A wild guessing game using a jar with jellybeans in it.
Calculate the average of many wild guesses.

Random Forest Regression

Python

# Random Forest Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 300, random_state = 0)
regressor.fit(X, y)

# Predicting a new result
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualising the Random Forest Regression results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (Random Forest Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Random Forest Regression

# Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Importing the dataset

dataset = pd.read_csv('Position_Salaries.csv')

X = dataset.iloc[:, 1:2].values

y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set

"""from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling

"""from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)

X_test = sc_X.transform(X_test)

sc_y = StandardScaler()

y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression to the dataset

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators = 300, random_state = 0)

regressor.fit(X, y)

# Predicting a new result
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualising the Random Forest Regression results (for higher resolution and smoother curve)

X_grid = np.arange(min(X), max(X), 0.01)

X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color = 'red')

plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')

plt.title('Truth or Bluff (Random Forest Regression)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Random Forest Regression

# Importing the dataset
dataset = read.csv('Position_Salaries.csv')
dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set
# # install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$Salary, SplitRatio = 2/3)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting the Random Forest Regression to the dataset
# install.packages('randomForest')
library(randomForest)
# Fix the seed so the randomly built forest is reproducible between runs.
set.seed(1234)
# The number of trees is set through `ntree` (lower case); R matches argument
# names case-sensitively, so any other spelling would not set the forest size.
regressor = randomForest(x = dataset[1],
                         y = dataset$Salary,
                         ntree = 500)

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Random Forest Regression results (for higher resolution and smoother curve)
# install.packages('ggplot2')
library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Random Forest Regression)') +
  xlab('Level') +
  ylab('Salary')

# Random Forest Regression

# Importing the dataset

dataset = read.csv('Position_Salaries.csv')

dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set

# # install.packages('caTools')

# library(caTools)

# set.seed(123)

# split = sample.split(dataset$Salary, SplitRatio = 2/3)

# training_set = subset(dataset, split == TRUE)

# test_set = subset(dataset, split == FALSE)

# Feature Scaling

# training_set = scale(training_set)

# test_set = scale(test_set)

# Fitting the Random Forest Regression to the dataset

# install.packages('randomForest')

library(randomForest)

# Fix the seed so the randomly built forest is reproducible between runs.
set.seed(1234)

# The number of trees is set through `ntree` (lower case); R matches argument
# names case-sensitively, so any other spelling would not set the forest size.
regressor = randomForest(x = dataset[1],
                         y = dataset$Salary,
                         ntree = 500)

# Predicting a new result

y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Random Forest Regression results (for higher resolution and smoother curve)

# install.packages('ggplot2')

library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

colour = 'red') +

geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),

colour = 'blue') +

ggtitle('Truth or Bluff (Random Forest Regression)') +

xlab('Level') +

ylab('Salary')

Machine Learning A-Z: Part 2 – Regression (Decision Tree Regression)

Decision Tree Intuition

CART (Classification and Regression Trees)
– Classification Trees
– Regression Trees

Splitting data into segments.
Split 1: X₁ < 20
Split 2: X₂ < 200
Split 3: X₂ < 170
Split 4: X₁ < 40

Decision Tree Regression

Python

# Decision Tree Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X, y)

# Predicting a new result
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualizing the Decision Tree Regression results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the Decision Tree Regression results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Decision Tree Regression

# Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Importing the dataset

dataset = pd.read_csv('Position_Salaries.csv')

X = dataset.iloc[:, 1:2].values

y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set

"""from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling

"""from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)

X_test = sc_X.transform(X_test)

sc_y = StandardScaler()

y_train = sc_y.fit_transform(y_train)"""

# Fitting the Decision Tree Regression to the dataset

from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state = 0)

regressor.fit(X, y)

# Predicting a new result
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualizing the Decision Tree Regression results

plt.scatter(X, y, color = 'red')

plt.plot(X, regressor.predict(X), color = 'blue')

plt.title('Truth or Bluff (Decision Tree Regression)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Visualising the Decision Tree Regression results (for higher resolution and smoother curve)

X_grid = np.arange(min(X), max(X), 0.01)

X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color = 'red')

plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')

plt.title('Truth or Bluff (Decision Tree Regression)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Decision Tree Regression

# Importing the dataset
dataset = read.csv('Position_Salaries.csv')
dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting the Decision Tree Regression to the dataset
# install.packages('rpart')
# rpart() and rpart.control() come from the rpart package, so it must be
# attached before the model is built.
library(rpart)
# minsplit = 1 lets the tree split nodes that hold very few observations.
regressor = rpart(formula = Salary ~ .,
                  data = dataset,
                  control = rpart.control(minsplit = 1))

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Decision Tree Regression results
# install.packages('ggplot2')
library(ggplot2)

ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),
            color = 'blue') +
  ggtitle('Truth or Bluff (Decision Tree Regression)') +
  xlab('Level') +
  ylab('Salary')

# Visualising the Decision Tree Regression results (for higher resolution and smoother curve)
# install.packages('ggplot2')
# library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)
ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            color = 'blue') +
  ggtitle('Truth or Bluff (Decision Tree Regression)') +
  xlab('Level') +
  ylab('Salary')

# Decision Tree Regression

# Importing the dataset

dataset = read.csv('Position_Salaries.csv')

dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

# library(caTools)

# set.seed(123)

# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)

# training_set = subset(dataset, split == TRUE)

# test_set = subset(dataset, split == FALSE)

# Feature Scaling

# training_set = scale(training_set)

# test_set = scale(test_set)

# Fitting the Decision Tree Regression to the dataset

# install.packages('rpart')

# rpart() and rpart.control() come from the rpart package, so it must be
# attached before the model is built.
library(rpart)

# minsplit = 1 lets the tree split nodes that hold very few observations.
regressor = rpart(formula = Salary ~ .,
                  data = dataset,
                  control = rpart.control(minsplit = 1))

# Predicting a new result

y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Decision Tree Regression results

# install.packages('ggplot2')

library(ggplot2)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),

color = 'blue') +

ggtitle('Truth or Bluff (Decision Tree Regression)') +

xlab('Level') +

ylab('Salary')

# Visualising the Decision Tree Regression results (for higher resolution and smoother curve)

# install.packages('ggplot2')

# library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),

color = 'blue') +

ggtitle('Truth or Bluff (Decision Tree Regression)') +

xlab('Level') +

ylab('Salary')

Machine Learning A-Z: Part 2 – Regression (SVR)

Support Vector Regression (SVR)

Python

# SVR

# Regression Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
# StandardScaler only accepts 2-D input, so y is scaled as a single column
# and then flattened back to 1-D, the target shape SVR.fit expects.
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Predicting a new result
# The query level is scaled with the X scaler, predicted in scaled space, and
# mapped back with the y scaler (inverse_transform also needs 2-D input).
scaled_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

# Visualizing the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualizing the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# SVR

# Regression Template

# Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Importing the dataset

dataset = pd.read_csv('Position_Salaries.csv')

X = dataset.iloc[:, 1:2].values

y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set

"""from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

sc_y = StandardScaler()

X = sc_X.fit_transform(X)

# StandardScaler only accepts 2-D input, so y is scaled as a single column
# and then flattened back to 1-D, the target shape SVR.fit expects.
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()

# Fitting SVR to the dataset

from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')

regressor.fit(X, y)

# Predicting a new result

# The query level is scaled with the X scaler, predicted in scaled space, and
# mapped back with the y scaler (inverse_transform also needs 2-D input).
scaled_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))

y_pred = sc_y.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

# Visualizing the SVR results

plt.scatter(X, y, color = 'red')

plt.plot(X, regressor.predict(X), color = 'blue')

plt.title('Truth or Bluff (SVR)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Visualizing the SVR results (for higher resolution and smoother curve)

X_grid = np.arange(min(X), max(X), 0.1)

X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color = 'red')

plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')

plt.title('Truth or Bluff (SVR)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Regression Template

# Importing the dataset
dataset = read.csv('Position_Salaries.csv')
dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting the SVR to the dataset
# install.packages('e1071')
# svm() comes from the e1071 package, so it must be attached before the call.
library(e1071)
# type = 'eps-regression' requests a regression model rather than a classifier.
regressor = svm(formula = Salary ~ .,
                data = dataset,
                type = 'eps-regression')

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the SVR results
# install.packages('ggplot2')
# library(ggplot2)

ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),
            color = 'blue') +
  ggtitle('Truth or Bluff (SVR Model)') +
  xlab('Level') +
  ylab('Salary')

# Regression Template

# Importing the dataset

dataset = read.csv('Position_Salaries.csv')

dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

# library(caTools)

# set.seed(123)

# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)

# training_set = subset(dataset, split == TRUE)

# test_set = subset(dataset, split == FALSE)

# Feature Scaling

# training_set = scale(training_set)

# test_set = scale(test_set)

# Fitting the SVR to the dataset

# install.packages('e1071')

# svm() comes from the e1071 package, so it must be attached before the call.
library(e1071)

# type = 'eps-regression' requests a regression model rather than a classifier.
regressor = svm(formula = Salary ~ .,
                data = dataset,
                type = 'eps-regression')

# Predicting a new result

y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the SVR results

# install.packages('ggplot2')

# library(ggplot2)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),

color = 'blue') +

ggtitle('Truth or Bluff (SVR Model)') +

xlab('Level') +

ylab('Salary')

Machine Learning A-Z: Part 2 – Regression (Polynomial Regression)

Polynomial Regression

Python

# Polynomial Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
# X_poly holds the polynomial terms of X up to degree 4.
# poly_reg stays fitted on the one-column X; fitting it on X_poly would make
# it expect the expanded columns as input instead.
X_poly = poly_reg.fit_transform(X)
# The expanded features are fitted with an ordinary linear model.
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Visualizing the Linear Regression results
# Red points are the observations; the blue line is the linear fit.
plt.scatter(X, y, color = 'red')
plt.plot(X, lin_reg.predict(X), color = 'blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualizing the Polynomial Regression results
plt.scatter(X, y, color = 'red')

# linear regression
# plt.plot(X, lin_reg.predict(X), color = 'blue')

# simple polynomial
# plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color = 'blue')

# more accurate polynomial
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.plot(X_grid, lin_reg_2.predict(poly_reg.fit_transform(X_grid)), color = 'blue')

plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Predicting a new result with Linear Regression
# Both calls take a 2-D array of shape (n_samples, n_features), so the single
# level 6.5 is passed as one row with one column.
lin_reg.predict([[6.5]])

# Predicting a new result with Polynomial Regression
# The level is expanded into its polynomial terms before prediction.
lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))

# Polynomial Regression

# Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Importing the dataset

dataset = pd.read_csv('Position_Salaries.csv')

X = dataset.iloc[:, 1:2].values

y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set

"""from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling

"""from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)

X_test = sc_X.transform(X_test)

sc_y = StandardScaler()

y_train = sc_y.fit_transform(y_train)"""

# Fitting Linear Regression to the dataset

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(X, y)

# Fitting Polynomial Regression to the dataset

from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree = 4)

# X_poly holds the polynomial terms of X up to degree 4.
# poly_reg stays fitted on the one-column X; fitting it on X_poly would make
# it expect the expanded columns as input instead.
X_poly = poly_reg.fit_transform(X)

# The expanded features are fitted with an ordinary linear model.
lin_reg_2 = LinearRegression()

lin_reg_2.fit(X_poly, y)

# Visualizing the Linear Regression results

# Red points are the observations; the blue line is the linear fit.
plt.scatter(X, y, color = 'red')

plt.plot(X, lin_reg.predict(X), color = 'blue')

plt.title('Truth or Bluff (Linear Regression)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Visualizing the Polynomial Regression results

plt.scatter(X, y, color = 'red')

# linear regression

# plt.plot(X, lin_reg.predict(X), color = 'blue')

# simple polynomial

# plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color = 'blue')

# more accurate polynomial

X_grid = np.arange(min(X), max(X), 0.1)

X_grid = X_grid.reshape((len(X_grid), 1))

plt.plot(X_grid, lin_reg_2.predict(poly_reg.fit_transform(X_grid)), color = 'blue')

plt.title('Truth or Bluff (Polynomial Regression)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Predicting a new result with Linear Regression

# Both calls take a 2-D array of shape (n_samples, n_features), so the single
# level 6.5 is passed as one row with one column.
lin_reg.predict([[6.5]])

# Predicting a new result with Polynomial Regression

# The level is expanded into its polynomial terms before prediction.
lin_reg_2.predict(poly_reg.fit_transform([[6.5]]))

Reset console.

from IPython import get_ipython
# Clear every user-defined name from the interactive namespace without
# prompting (-f) and without touching history (-s, soft reset).
# run_line_magic replaces the deprecated magic() call.
get_ipython().run_line_magic('reset', '-sf')

# Helper that performs the same soft, forced reset on demand.
def __reset__(): get_ipython().run_line_magic('reset', '-sf')

from IPython import get_ipython

# Clear every user-defined name from the interactive namespace without
# prompting (-f) and without touching history (-s, soft reset).
# run_line_magic replaces the deprecated magic() call.
get_ipython().run_line_magic('reset', '-sf')

# Helper that performs the same soft, forced reset on demand.
def __reset__(): get_ipython().run_line_magic('reset', '-sf')

IPythonコンソール|カーネルの再起動

Show summary.

summary(poly_reg)

1	summary(poly_reg)

# Polynomial Regression

# Importing the dataset
dataset = read.csv('Position_Salaries.csv')
dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting Linear Regression to the dataset
lin_reg = lm(formula = Salary ~ .,
             data = dataset)

# Fitting Polynomial Regression to the dataset
dataset$Level2 = dataset$Level^2
dataset$Level3 = dataset$Level^3
dataset$Level4 = dataset$Level^4
poly_reg = lm(formula = Salary ~ .,
              data = dataset)

# Visualising the Linear Regression results
# install.packages('ggplot2')
# library(ggplot2)
ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = dataset$Level, y = predict(lin_reg, newdata = dataset)),
            color = 'blue') +
  ggtitle('Truth or Bluff (Linear Regression)') +
  xlab('Level') +
  ylab('Salary')

# Visualising the Polynomial Regression results
ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = dataset$Level, y = predict(poly_reg, newdata = dataset)),
            color = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')

# Predicting a new result with Linear Regression
y_pred = predict(lin_reg, data.frame(Level = 6.5))

# Predicting a new result with Polynomial Regression
y_pred = predict(poly_reg, data.frame(Level = 6.5,
                                      Level2 = 6.5^2,
                                      Level3 = 6.5^3,
                                      Level4 = 6.5^4))

# Polynomial Regression

# Importing the dataset

dataset = read.csv('Position_Salaries.csv')

dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

# library(caTools)

# set.seed(123)

# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)

# training_set = subset(dataset, split == TRUE)

# test_set = subset(dataset, split == FALSE)

# Feature Scaling

# training_set = scale(training_set)

# test_set = scale(test_set)

# Fitting Linear Regression to the dataset

lin_reg = lm(formula = Salary ~ .,

data = dataset)

# Fitting Polynomial Regression to the dataset

dataset$Level2 = dataset$Level^2

dataset$Level3 = dataset$Level^3

dataset$Level4 = dataset$Level^4

poly_reg = lm(formula = Salary ~ .,

data = dataset)

# Visualising the Linear Regression results

# install.packages('ggplot2')

# library(ggplot2)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = dataset$Level, y = predict(lin_reg, newdata = dataset)),

color = 'blue') +

ggtitle('Truth or Bluff (Linear Regression)') +

xlab('Level') +

ylab('Salary')

# Visualising the Polynomial Regression results

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = dataset$Level, y = predict(poly_reg, newdata = dataset)),

color = 'blue') +

ggtitle('Truth or Bluff (Polynomial Regression)') +

xlab('Level') +

ylab('Salary')

# Predicting a new result with Linear Regression

y_pred = predict(lin_reg, data.frame(Level = 6.5))

# Predicting a new result with Polynomial Regression

y_pred = predict(poly_reg, data.frame(Level = 6.5,

Level2 = 6.5^2,

Level3 = 6.5^3,

Level4 = 6.5^4))

Templates

Python

# Regression Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Regression to the dataset
# Create your regressor here

# Predicting a new result
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualizing the Regression results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (Regression Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualizing the Regression results (for higher resolution and smoother curve)
# Build a dense grid from the smallest to the largest level in steps of 0.1;
# the stop value must be the maximum, otherwise the grid is empty.
X_grid = np.arange(X.min(), X.max(), 0.1)
# Reshape to one column so the grid has the 2-D shape predict() expects.
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (Regression Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Regression Template

# Importing the libraries

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Importing the dataset

dataset = pd.read_csv('Position_Salaries.csv')

X = dataset.iloc[:, 1:2].values

y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set

"""from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling

"""from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)

X_test = sc_X.transform(X_test)

sc_y = StandardScaler()

y_train = sc_y.fit_transform(y_train)"""

# Fitting the Regression to the dataset

# Create your regressor here

# Predicting a new result

# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single level 6.5 is passed as one row with one column.
y_pred = regressor.predict([[6.5]])

# Visualizing the Regression results

plt.scatter(X, y, color = 'red')

plt.plot(X, regressor.predict(X), color = 'blue')

plt.title('Truth or Bluff (Regression Model)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Visualizing the Regression results (for higher resolution and smoother curve)

# Build a dense grid from the smallest to the largest level in steps of 0.1;
# the stop value must be the maximum, otherwise the grid is empty.
X_grid = np.arange(X.min(), X.max(), 0.1)

# Reshape to one column so the grid has the 2-D shape predict() expects.
X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color = 'red')

plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')

plt.title('Truth or Bluff (Regression Model)')

plt.xlabel('Position level')

plt.ylabel('Salary')

plt.show()

# Regression Template

# Importing the dataset
dataset = read.csv('Position_Salaries.csv')
dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting the Regression Model to the dataset
# Create your regressor here

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Regression Model results
# install.packages('ggplot2')
# library(ggplot2)

ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),
            color = 'blue') +
  ggtitle('Truth or Bluff (Regression Model)') +
  xlab('Level') +
  ylab('Salary')

# Visualising the Regression Model results (for higher resolution and smoother curve)
# install.packages('ggplot2')
# library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)
ggplot() + 
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             color = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            color = 'blue') +
  ggtitle('Truth or Bluff (Regression Model)') +
  xlab('Level') +
  ylab('Salary')

# Regression Template

# Importing the dataset

dataset = read.csv('Position_Salaries.csv')

dataset = dataset[2:3]

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

# library(caTools)

# set.seed(123)

# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)

# training_set = subset(dataset, split == TRUE)

# test_set = subset(dataset, split == FALSE)

# Feature Scaling

# training_set = scale(training_set)

# test_set = scale(test_set)

# Fitting the Regression Model to the dataset

# Create your regressor here

# Predicting a new result

y_pred = predict(regressor, data.frame(Level = 6.5))

# Visualising the Regression Model results

# install.packages('ggplot2')

# library(ggplot2)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),

color = 'blue') +

ggtitle('Truth or Bluff (Regression Model)') +

xlab('Level') +

ylab('Salary')

# Visualising the Regression Model results (for higher resolution and smoother curve)

# install.packages('ggplot2')

# library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)

ggplot() +

geom_point(aes(x = dataset$Level, y = dataset$Salary),

color = 'red') +

geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),

color = 'blue') +

ggtitle('Truth or Bluff (Regression Model)') +

xlab('Level') +

ylab('Salary')

Machine Learning A-Z: Part 2 – Regression (Multiple Linear Regression)

Dummy Variable Trap

Dummy variables must be:
D₂ = 1 – D₁

You cannot include the constant and all of the dummy variables in the model at the same time; always omit one dummy variable from each set.

Building a model

PDF

1. All-in
=> 2. Backward Elimination
3. Forward Selection
4. Bidirectional Elimination
5. Score Comparison

Akaike information criterion (AIC) 赤池情報量規準

– a measure of the relative quality of statistical models for a given set of data.
– Given a collection of models for the data, estimates the quality of each model, relative to each of the other models.
– Hence, provides a means for model selection.

Multiple Linear Regression

Python

# Multiple Linear Regression
# Fits a linear model on the 50_Startups data, predicts the test set, then
# runs a manual Backward Elimination with statsmodels OLS.

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values # independent variables
y = dataset.iloc[:, 4].values # dependent variable

# Encoding categorical data
# Encoding the Independent Variable
# OneHotEncoder(categorical_features = ...) was removed from scikit-learn, so
# the encoder is applied to column 3 through a ColumnTransformer instead.
# The encoded dummy columns come first and the untouched columns follow, which
# is the same column order the old call produced. OneHotEncoder accepts string
# categories directly, so no LabelEncoder step is needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
columntransformer = ColumnTransformer(
    [('categorical', onehotencoder, [3])],
    remainder = 'passthrough',
    sparse_threshold = 0) # 0 => always return a dense array
X = np.array(columntransformer.fit_transform(X), dtype = float)

# Avoiding the Dummy Variable Trap
# Drop the first dummy column so the dummies are not collinear with the constant.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
# train_test_split lives in sklearn.model_selection (sklearn.cross_validation
# was removed from scikit-learn).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Building the optimal model using Backward Elimination
# The array-based OLS class is exposed by statsmodels.api; statsmodels.formula.api
# only provides the lowercase formula interface.
import statsmodels.api as sm
# OLS does not add an intercept by itself: prepend a column of ones. The row
# count is taken from X instead of being hard-coded to 50.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)
# Column layout after this step: 0 = constant, 1-2 = state dummies,
# 3-5 = the three remaining numeric columns in their original order.

# Step 1: all predictors
X_opt = X[:, [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 2: column 2 removed
X_opt = X[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 3: column 1 removed
X_opt = X[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 4: column 4 removed
X_opt = X[:, [0,3,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 5: column 5 removed; only the constant and column 3 remain
X_opt = X[:, [0,3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Multiple Linear Regression
# Fits a linear model on the 50_Startups data, predicts the test set, then
# runs a manual Backward Elimination with statsmodels OLS.

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values # independent variables
y = dataset.iloc[:, 4].values # dependent variable

# Encoding categorical data
# Encoding the Independent Variable
# OneHotEncoder(categorical_features = ...) was removed from scikit-learn, so
# the encoder is applied to column 3 through a ColumnTransformer instead.
# The encoded dummy columns come first and the untouched columns follow, which
# is the same column order the old call produced. OneHotEncoder accepts string
# categories directly, so no LabelEncoder step is needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
columntransformer = ColumnTransformer(
    [('categorical', onehotencoder, [3])],
    remainder = 'passthrough',
    sparse_threshold = 0) # 0 => always return a dense array
X = np.array(columntransformer.fit_transform(X), dtype = float)

# Avoiding the Dummy Variable Trap
# Drop the first dummy column so the dummies are not collinear with the constant.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
# train_test_split lives in sklearn.model_selection (sklearn.cross_validation
# was removed from scikit-learn).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Building the optimal model using Backward Elimination
# The array-based OLS class is exposed by statsmodels.api; statsmodels.formula.api
# only provides the lowercase formula interface.
import statsmodels.api as sm
# OLS does not add an intercept by itself: prepend a column of ones. The row
# count is taken from X instead of being hard-coded to 50.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)
# Column layout after this step: 0 = constant, 1-2 = state dummies,
# 3-5 = the three remaining numeric columns in their original order.

# Step 1: all predictors
X_opt = X[:, [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 2: column 2 removed
X_opt = X[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 3: column 1 removed
X_opt = X[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 4: column 4 removed
X_opt = X[:, [0,3,5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Step 5: column 5 removed; only the constant and column 3 remain
X_opt = X[:, [0,3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

# Multiple Linear Regression
# Fits Profit against all predictors on a training split, predicts the test
# split, then runs a manual Backward Elimination on the full dataset.

# Importing the dataset
dataset = read.csv('50_Startups.csv')

# Encoding categorical data
# State is turned into a factor with the three listed levels relabelled 1-3.
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123) # fixed seed so the split is reproducible
split = sample.split(dataset$Profit, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting Multiple Linear Regression to the Training set
# `Profit ~ .` regresses Profit on every other column of training_set.
regressor = lm(formula = Profit ~ .,
               data = training_set)

# Predicting the Test set results
y_pred = predict(regressor, newdata = test_set)

# Building the optimal model using Backward Elimination
# One predictor is removed per step and the model is refitted; inspect the
# p-values printed by summary() after each fit. Note that `regressor` is
# overwritten here, so the training-set model above is no longer available.

# Step 1: all predictors
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
               data = dataset)
summary(regressor)

# Step 2: State removed
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
               data = dataset)
summary(regressor)

# Step 3: Administration removed (this refit was previously skipped, which
# dropped two predictors in a single step)
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
               data = dataset)
summary(regressor)

# Step 4: Marketing.Spend removed
regressor = lm(formula = Profit ~ R.D.Spend,
               data = dataset)
summary(regressor)

# Multiple Linear Regression
# Fits Profit against all predictors on a training split, predicts the test
# split, then runs a manual Backward Elimination on the full dataset.

# Importing the dataset
dataset = read.csv('50_Startups.csv')

# Encoding categorical data
# State is turned into a factor with the three listed levels relabelled 1-3.
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123) # fixed seed so the split is reproducible
split = sample.split(dataset$Profit, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting Multiple Linear Regression to the Training set
# `Profit ~ .` regresses Profit on every other column of training_set.
regressor = lm(formula = Profit ~ .,
               data = training_set)

# Predicting the Test set results
y_pred = predict(regressor, newdata = test_set)

# Building the optimal model using Backward Elimination
# One predictor is removed per step and the model is refitted; inspect the
# p-values printed by summary() after each fit. Note that `regressor` is
# overwritten here, so the training-set model above is no longer available.

# Step 1: all predictors
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
               data = dataset)
summary(regressor)

# Step 2: State removed
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
               data = dataset)
summary(regressor)

# Step 3: Administration removed (this refit was previously skipped, which
# dropped two predictors in a single step)
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
               data = dataset)
summary(regressor)

# Step 4: Marketing.Spend removed
regressor = lm(formula = Profit ~ R.D.Spend,
               data = dataset)
summary(regressor)

Clear the RStudio environment (remove all objects from the workspace):

# Remove every object that ls() reports in the current environment.
rm(list = ls())

1	rm(list=ls())

RStudio Keyboard Shortcuts