Multiple Linear Regression in R
Dataset: 50_Startups.csv
| R&D Spend | Administration | Marketing Spend | State | Profit |
|-----------|----------------|-----------------|-------|--------|
| 165349.2 | 136897.8 | 471784.1 | New York | 192261.8 |
| 162597.7 | 151377.6 | 443898.5 | California | 191792.1 |
| 153441.5 | 101145.6 | 407934.5 | Florida | 191050.4 |
| 144372.4 | 118671.9 | 383199.6 | New York | 182902 |
| 142107.3 | 91391.77 | 366168.4 | Florida | 166187.9 |
| 131876.9 | 99814.71 | 362861.4 | New York | 156991.1 |
| 134615.5 | 147198.9 | 127716.8 | California | 156122.5 |
| 130298.1 | 145530.1 | 323876.7 | Florida | 155752.6 |
| 120542.5 | 148719 | 311613.3 | New York | 152211.8 |
| 123334.9 | 108679.2 | 304981.6 | California | 149760 |
| 101913.1 | 110594.1 | 229161 | Florida | 146122 |
| 100672 | 91790.61 | 249744.6 | California | 144259.4 |
| 93863.75 | 127320.4 | 249839.4 | Florida | 141585.5 |
| 91992.39 | 135495.1 | 252664.9 | California | 134307.4 |
| 119943.2 | 156547.4 | 256512.9 | Florida | 132602.7 |
| 114523.6 | 122616.8 | 261776.2 | New York | 129917 |
| 78013.11 | 121597.6 | 264346.1 | California | 126992.9 |
| 94657.16 | 145077.6 | 282574.3 | New York | 125370.4 |
| 91749.16 | 114175.8 | 294919.6 | Florida | 124266.9 |
| 86419.7 | 153514.1 | 0 | New York | 122776.9 |
| 76253.86 | 113867.3 | 298664.5 | California | 118474 |
| 78389.47 | 153773.4 | 299737.3 | New York | 111313 |
| 73994.56 | 122782.8 | 303319.3 | Florida | 110352.3 |
| 67532.53 | 105751 | 304768.7 | Florida | 108734 |
| 77044.01 | 99281.34 | 140574.8 | New York | 108552 |
| 64664.71 | 139553.2 | 137962.6 | California | 107404.3 |
| 75328.87 | 144136 | 134050.1 | Florida | 105733.5 |
| 72107.6 | 127864.6 | 353183.8 | New York | 105008.3 |
| 66051.52 | 182645.6 | 118148.2 | Florida | 103282.4 |
| 65605.48 | 153032.1 | 107138.4 | New York | 101004.6 |
| 61994.48 | 115641.3 | 91131.24 | Florida | 99937.59 |
| 61136.38 | 152701.9 | 88218.23 | New York | 97483.56 |
| 63408.86 | 129219.6 | 46085.25 | California | 97427.84 |
| 55493.95 | 103057.5 | 214634.8 | Florida | 96778.92 |
| 46426.07 | 157693.9 | 210797.7 | California | 96712.8 |
| 46014.02 | 85047.44 | 205517.6 | New York | 96479.51 |
| 28663.76 | 127056.2 | 201126.8 | Florida | 90708.19 |
| 44069.95 | 51283.14 | 197029.4 | California | 89949.14 |
| 20229.59 | 65947.93 | 185265.1 | New York | 81229.06 |
| 38558.51 | 82982.09 | 174999.3 | California | 81005.76 |
| 28754.33 | 118546.1 | 172795.7 | California | 78239.91 |
| 27892.92 | 84710.77 | 164470.7 | Florida | 77798.83 |
| 23640.93 | 96189.63 | 148001.1 | California | 71498.49 |
| 15505.73 | 127382.3 | 35534.17 | New York | 69758.98 |
| 22177.74 | 154806.1 | 28334.72 | California | 65200.33 |
| 1000.23 | 124153 | 1903.93 | New York | 64926.08 |
| 1315.46 | 115816.2 | 297114.5 | Florida | 49490.75 |
| 0 | 135426.9 | 0 | California | 42559.73 |
| 542.05 | 51743.15 | 0 | New York | 35673.41 |
| 0 | 116983.8 | 45173.06 | California | 14681.4 |
Step 1
# Import the dataset
dataset1 = read.csv('50_Startups.csv')
Step 2
# Encoding categorical data
dataset1$State = factor(dataset1$State,
levels = c('New York', 'California', 'Florida'),
labels = c(1, 2, 3))
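Step 3 below fits the model to training_set, but the post never shows the split being created. The residual degrees of freedom in the first summary (34, with 5 estimated slopes plus an intercept) imply 40 training rows, i.e. an 80/20 split of the 50 observations. A minimal sketch of such a split using the caTools package (an assumption; any splitting method works):

```r
# Assumed step: split the data into training and test sets (80/20)
# install.packages('caTools')  # if not already installed
library(caTools)
set.seed(123)  # for a reproducible split
split = sample.split(dataset1$Profit, SplitRatio = 0.8)
training_set = subset(dataset1, split == TRUE)   # 40 observations
test_set = subset(dataset1, split == FALSE)      # 10 observations
```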
Step 3
# Fitting Multiple Linear Regression to the Training set
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
               data = training_set)
or equivalently, with `.` standing for all columns other than Profit:
regressor = lm(formula = Profit ~ .,
data = training_set)
summary(regressor)
Call:
lm(formula = Profit ~ ., data = training_set)
Residuals:
Min 1Q Median 3Q Max
-33128 -4865 5 6098 18065
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.965e+04 7.637e+03 6.501 1.94e-07 ***
R.D.Spend 7.986e-01 5.604e-02 14.251 6.70e-16 ***
Administration -2.942e-02 5.828e-02 -0.505 0.617
Marketing.Spend 3.268e-02 2.127e-02 1.537 0.134
State2 1.213e+02 3.751e+03 0.032 0.974
State3 2.376e+02 4.127e+03 0.058 0.954
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9908 on 34 degrees of freedom
Multiple R-squared: 0.9499, Adjusted R-squared: 0.9425
F-statistic: 129 on 5 and 34 DF, p-value: < 2.2e-16
# Predicting the Test set results
y_pred = predict(regressor, newdata = test_set)
Step 4
# Building the optimal model using Backward Elimination
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
data = dataset1)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend +
State, data = dataset1)
Residuals:
Min 1Q Median 3Q Max
-33504 -4736 90 6672 17338
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.008e+04 6.953e+03 7.204 5.76e-09 ***
R.D.Spend 8.060e-01 4.641e-02 17.369 < 2e-16 ***
Administration -2.700e-02 5.223e-02 -0.517 0.608
Marketing.Spend 2.698e-02 1.714e-02 1.574 0.123
State2 4.189e+01 3.256e+03 0.013 0.990
State3 2.407e+02 3.339e+03 0.072 0.943
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9439 on 44 degrees of freedom
Multiple R-squared: 0.9508, Adjusted R-squared: 0.9452
F-statistic: 169.9 on 5 and 44 DF, p-value: < 2.2e-16
Note that R automatically expanded the State factor into two dummy variables (State2, State3). Neither is significant (p = 0.990 and 0.943), so State is removed first.
Step 5
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
data = dataset1)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
data = dataset1)
Residuals:
Min 1Q Median 3Q Max
-33534 -4795 63 6606 17275
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.012e+04 6.572e+03 7.626 1.06e-09 ***
R.D.Spend 8.057e-01 4.515e-02 17.846 < 2e-16 ***
Administration -2.682e-02 5.103e-02 -0.526 0.602
Marketing.Spend 2.723e-02 1.645e-02 1.655 0.105
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9232 on 46 degrees of freedom
Multiple R-squared: 0.9507, Adjusted R-squared: 0.9475
F-statistic: 296 on 3 and 46 DF, p-value: < 2.2e-16
Administration is not significant (p = 0.602), so it is removed next.
Step 6
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
data = dataset1)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset1)
Residuals:
Min 1Q Median 3Q Max
-33645 -4632 -414 6484 17097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.698e+04 2.690e+03 17.464 <2e-16 ***
R.D.Spend 7.966e-01 4.135e-02 19.266 <2e-16 ***
Marketing.Spend 2.991e-02 1.552e-02 1.927 0.06 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared: 0.9505, Adjusted R-squared: 0.9483
F-statistic: 450.8 on 2 and 47 DF, p-value: < 2.2e-16
Marketing.Spend is only marginally significant (p = 0.06), so at the 5% level it is removed as well.
Step 7
regressor = lm(formula = Profit ~ R.D.Spend,
data = dataset1)
summary(regressor)
Call:
lm(formula = Profit ~ R.D.Spend, data = dataset1)
Residuals:
Min 1Q Median 3Q Max
-34351 -4626 -375 6249 17188
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.903e+04 2.538e+03 19.32 <2e-16 ***
R.D.Spend 8.543e-01 2.931e-02 29.15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9416 on 48 degrees of freedom
Multiple R-squared: 0.9465, Adjusted R-squared: 0.9454
F-statistic: 849.8 on 1 and 48 DF, p-value: < 2.2e-16
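The manual elimination in Steps 4-7 can also be automated with base R's step() function. Note that step() selects variables by AIC rather than by p-values, so it will not necessarily drop exactly the same terms as the manual procedure above:

```r
# Automated alternative: backward elimination by AIC using base R's step()
full_model = lm(formula = Profit ~ ., data = dataset1)
best_model = step(full_model, direction = "backward")
summary(best_model)
```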
Step 8
y_pred = predict(regressor, newdata = test_set)
y_pred
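To judge how close the final model's predictions are to the actual profits, the two can be placed side by side (a small illustrative check, not part of the original post):

```r
# Compare actual vs predicted Profit on the test set
comparison = data.frame(Actual = test_set$Profit,
                        Predicted = y_pred,
                        Error = test_set$Profit - y_pred)
print(comparison)
```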
Step 9
# Visualising the Training set results
library(ggplot2)
ggplot() +
  geom_point(aes(x = training_set$R.D.Spend, y = training_set$Profit),
             color = 'red') +
  geom_line(aes(x = training_set$R.D.Spend, y = predict(regressor, newdata = training_set)),
            color = 'blue') +
  ggtitle('Profit vs R.D.Spend (Training set)') +
  xlab('R.D.Spend') +
  ylab('Profit')
# Plot interpretation
The red points are the actual Profit values; the blue line is our linear regression model.
# Visualising the Test set results
ggplot() +
  geom_point(aes(x = test_set$R.D.Spend, y = test_set$Profit),
             color = 'red') +
  geom_line(aes(x = training_set$R.D.Spend, y = predict(regressor, newdata = training_set)),
            color = 'blue') +
  ggtitle('Profit vs R.D.Spend (Test set)') +
  xlab('R.D.Spend') +
  ylab('Profit')
# Plot interpretation
The red points are the actual test-set Profit values; the blue line is the same regression model fitted on the training set. The test points lie close to the line, so the predictions are close to the real values.