Acknowledgment: the materials below are partially based on Montgomery, D. C., Peck, E. A., Vining, G. G., Introduction to Linear Regression Analysis (5th Edition), Wiley Series in Probability and Statistics, 2012. These materials were initiated by Yichen Qin and modified by Tianhai Zu for teaching purposes.

Exercise 1

For the Chicago data set, do the following analysis.

# Load the Chicago data set; names(d) below shows its two columns, x and y.
d <- read.csv("data_chicago.csv")
names(d)
## [1] "x" "y"
# Scatter plot of y against x. Columns are taken from d through `data = d`
# instead of attach(d), so nothing is left on the search path if a later
# statement fails.
plot(y ~ x, data = d)
# Simple linear regression of y on x.
model2 <- lm(y ~ x, data = d)
summary(model2)
## 
## Call:
## lm(formula = y ~ x, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.628  -9.028  -0.009   7.882  77.861 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16.9952     4.9949   3.403 0.001528 ** 
## x             1.3135     0.3144   4.177 0.000155 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.46 on 40 degrees of freedom
## Multiple R-squared:  0.3037, Adjusted R-squared:  0.2863 
## F-statistic: 17.45 on 1 and 40 DF,  p-value: 0.0001553
# Add the fitted regression line to the scatter plot drawn above.
abline(model2)

# Observed response against fitted values, with the 45-degree reference line.
plot(d$y, model2$fitted.values, xlab = "y")
abline(a = 0, b = 1)

# Histogram of the residuals.
hist(model2$residuals)

# Residuals in observation order, drawn as points with vertical spikes
# and a horizontal reference line at zero.
plot(model2$residuals, pch = 20)
points(model2$residuals, type = "h")
abline(h = 0)

# 95% confidence intervals for the intercept and the slope.
confint(model2, level = 0.95)
##                 2.5 %    97.5 %
## (Intercept) 6.9001268 27.090188
## x           0.6779667  1.948945

Exercise 2

For the Delta Airlines data set, repeat the same analysis as in Exercise 1.

# Load the Delta Airlines data set; names(d) below lists its six columns.
d <- read.csv("data_delta.csv")
names(d)
## [1] "Day.of.month"            "Flight.Number"          
## [3] "Tail.Number"             "Destination.airport"    
## [5] "Airborne.time..minutes." "Distance...miles."
# Give the airborne-time and distance columns short names.
names(d)[5:6] <- c("Time", "Dist")
# Scatter plot of Time against Dist. Columns are taken from d through
# `data = d` instead of attach(d), so nothing is left on the search path if a
# later statement fails.
plot(Time ~ Dist, data = d)
# Simple linear regression of Time on Dist.
model3 <- lm(Time ~ Dist, data = d)
summary(model3)
## 
## Call:
## lm(formula = Time ~ Dist, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.981  -4.217  -0.542   4.253  83.151 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.5556457  0.1147633   83.26   <2e-16 ***
## Dist        0.1264290  0.0001282  985.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.004 on 17721 degrees of freedom
## Multiple R-squared:  0.9821, Adjusted R-squared:  0.9821 
## F-statistic: 9.719e+05 on 1 and 17721 DF,  p-value: < 2.2e-16
# Add the fitted regression line to the scatter plot drawn above.
abline(model3)

# Observed response against fitted values, with the 45-degree reference line.
plot(d$Time, model3$fitted.values, xlab = "Time")
abline(a = 0, b = 1)

# Histogram of the residuals.
hist(model3$residuals)

# Residuals in observation order, drawn as points with vertical spikes
# and a horizontal reference line at zero.
plot(model3$residuals, pch = 20)
points(model3$residuals, type = "h")
abline(h = 0)

# 95% confidence intervals for the intercept and the slope.
confint(model3, level = 0.95)
##                 2.5 %    97.5 %
## (Intercept) 9.3306984 9.7805931
## Dist        0.1261777 0.1266804

Exercise 3

For the Vote data set, repeat the same analysis as in Exercise 1.

# Load the Vote data set; names(d) below lists its four columns.
d <- read.csv("data_vote.csv")
names(d)
## [1] "Congress"                 "Beginning.Year"          
## [3] "Average.Party.Unlikeness" "X..Party.Votes"
# Give the third and fourth columns short names.
names(d)[3:4] <- c("Unlikeness", "Percentage")
# Scatter-plot matrix of all four columns.
pairs(d)

# Make the columns of d visible by name for the statements that follow.
# NOTE(review): no matching detach(d) is visible in this part of the file;
# confirm that d is detached later.
attach(d)
# Simple linear regression of Percentage on Unlikeness.
model4 <- lm(Percentage ~ Unlikeness)
summary(model4)
## 
## Call:
## lm(formula = Percentage ~ Unlikeness)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8444 -2.3839 -0.4884  2.8817 10.1669 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.21475    2.13118   4.793 1.29e-05 ***
## Unlikeness   1.04519    0.04356  23.993  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.197 on 55 degrees of freedom
## Multiple R-squared:  0.9128, Adjusted R-squared:  0.9112 
## F-statistic: 575.7 on 1 and 55 DF,  p-value: < 2.2e-16
# Scatter plot of the two modelled variables (predictor on the x-axis,
# response on the y-axis), so that the regression line and the lowess curve
# added below are drawn on matching axes.
plot(Unlikeness, Percentage)
# Fitted regression line from model4.
abline(model4)
# Lowess smooth of the same relationship, for a visual check of linearity.
lines(lowess(Percentage ~ Unlikeness))

# Scatter plot of Percentage against Unlikeness using solid points.
plot(Unlikeness, Percentage, pch = 20)

# Residuals in observation order, drawn as points with vertical spikes
# and a thick red horizontal reference line at zero.
plot(model4$residuals, pch = 20)
points(model4$residuals, type = "h")
abline(h = 0, col = "red", lwd = 3)

# 95% confidence intervals for the intercept and the slope.
confint(model4, level = 0.95)
##                 2.5 %    97.5 %
## (Intercept) 5.9437738 14.485729
## Unlikeness  0.9578917  1.132491