CC BY-NC-ND 3.0
bdd <- read.table("./DATA/bdd.csv", header = TRUE, sep = ",", dec = ".")
bdd$date <- as.POSIXct(
paste0(
bdd$year, "-",
bdd$month, "-",
bdd$day, " ",
bdd$hour, ":",
bdd$minute, ":",
bdd$second),
format = "%Y-%m-%d %H:%M:%S"
)
bdd$date <- as.numeric(bdd$date)
set.seed(12345)
sampleP <- sample(1:nrow(bdd), size = 1000)
bdd <- bdd[sampleP, c("temperature", "pressure", "date")]
str(bdd)
## 'data.frame': 1000 obs. of 3 variables:
## $ temperature: num 26.78 9.79 3.85 6.91 26.58 ...
## $ pressure : num 995 1039 1066 1032 997 ...
## $ date : num 1.57e+09 1.55e+09 1.55e+09 1.55e+09 1.56e+09 ...
getNiceLmPlot <- function(myX, myY, myYlim = c(-10, 50)){
mod041 <- lm(myY ~ myX)
myCol <- colorRampPalette(c("green", "blue", "red"))(101)
colRank <- (myY[!is.na(myX)] - predict(mod041))^2
colRank <- round((colRank - min(colRank)) /
(max(colRank) - min(colRank)) * 100) + 1
plot(x = myX[!is.na(myX)], y = myY[!is.na(myX)],
col = myCol[colRank], pch = 16,
axes = FALSE, ylim = myYlim, panel.first = {
grid()
axis(1)
axis(2)
})
points(
x = myX[!is.na(myX)][order(myX[!is.na(myX)])],
y = predict(mod041)[order(myX[!is.na(myX)])],
type = 'l', lwd = 2)
}
getNiceLmPlot(
myX = bdd$pressure,
myY = bdd$temperature)
getNicePolyLmPlot <- function(myX, myY, myYlim = c(-10, 50), polyN = 2){
mod041 <- lm(myY ~ poly(myX, polyN))
myCol <- colorRampPalette(c("green", "blue", "red"))(101)
colRank <- (myY[!is.na(myX)] - predict(mod041))^2
colRank <- round((colRank - min(colRank)) /
(max(colRank) - min(colRank)) * 100) + 1
plot(x = myX[!is.na(myX)], y = myY[!is.na(myX)],
col = myCol[colRank], pch = 16,
axes = FALSE, ylim = myYlim, panel.first = {
grid()
axis(1)
axis(2)
})
points(
x = myX[!is.na(myX)][order(myX[!is.na(myX)])],
y = predict(mod041)[order(myX[!is.na(myX)])],
type = 'l', lwd = 2)
return(mod041)
}
modPoly <- getNicePolyLmPlot(
myX = bdd$pressure,
myY = bdd$temperature)
##
## Call:
## lm(formula = myY ~ poly(myX, polyN))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0500 -2.1855 0.2745 2.4241 10.2494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7864 0.1162 92.87 <2e-16 ***
## poly(myX, polyN)1 -231.5078 3.6730 -63.03 <2e-16 ***
## poly(myX, polyN)2 65.3138 3.6730 17.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.673 on 997 degrees of freedom
## Multiple R-squared: 0.8114, Adjusted R-squared: 0.811
## F-statistic: 2144 on 2 and 997 DF, p-value: < 2.2e-16
On peut voir que quand la pression atmosphérique est basse, parfois la température reste basse, ce qui explique la distribution non Normale des résidus. Nous allons ignorer ce problème ici qui pourrait être résolu en conditionnant les valeurs de pression atmosphérique à certaines heures de la journée.
##
## Shapiro-Wilk normality test
##
## data: resid(modPoly)
## W = 0.98265, p-value = 1.562e-09
##
## Call:
## lm(formula = myY ~ poly(myX, polyN))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2992 -3.9771 -0.4574 3.5839 25.8801
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7864 0.1883 57.293 <2e-16 ***
## poly(myX, polyN)1 189.0865 5.9536 31.760 <2e-16 ***
## poly(myX, polyN)2 14.8321 5.9536 2.491 0.0129 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.954 on 997 degrees of freedom
## Multiple R-squared: 0.5045, Adjusted R-squared: 0.5035
## F-statistic: 507.5 on 2 and 997 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = myY ~ poly(myX, polyN))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5665 -3.5522 -0.1015 2.9821 22.7406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7864 0.1624 66.402 < 2e-16 ***
## poly(myX, polyN)1 189.0865 5.1369 36.810 < 2e-16 ***
## poly(myX, polyN)2 14.8321 5.1369 2.887 0.00397 **
## poly(myX, polyN)3 -95.1672 5.1369 -18.526 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.137 on 996 degrees of freedom
## Multiple R-squared: 0.6315, Adjusted R-squared: 0.6303
## F-statistic: 568.8 on 3 and 996 DF, p-value: < 2.2e-16
\[Y=\beta_1 + \beta_2X + \beta_3x^2+...+\beta_px^p+\epsilon\]
myX = bdd$date
myY = bdd$temperature
nbrDeg <- 4 # essayer avec 20 ;-)
mod_X <- lapply(1:nbrDeg, function(i){
lm(myY ~ poly(myX, i))
})
plot(x = myX[!is.na(myX)], y = myY[!is.na(myX)],
col = "gray", pch = 16,
axes = FALSE, ylim = c(-10, 50), panel.first = {
grid()
axis(1)
axis(2)
})
for(i in seq_along(mod_X))
points(
x = myX[!is.na(myX)][order(myX[!is.na(myX)])],
y = predict(mod_X[[i]])[order(myX[!is.na(myX)])],
type = 'l', lwd = 2, col = i)
legend("topleft", lwd = 2, col = 1:length(mod_X),
legend = paste0("Reg. Lin. Poly. deg. ", 1:nbrDeg))