Cargamos nuestro querido iris data set (notemos que no hay valores perdidos).
# Load the built-in iris data set and show a compact summary of its columns.
data("iris")
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Estadísticos descriptivos muy básicos:
# Per-species descriptive statistics for the four numeric measurements
# (columns 1 to 4): mean, standard deviation, maximum, minimum, number of
# non-missing values and number of missing values.
# aggregate() returns one matrix column per measurement, which prints as
# <measurement>.<statistic>.
desc.est.iris <- aggregate(
  iris[, 1:4],
  by = list(Tipo = iris$Species),
  FUN = function(x) {
    n_missing <- sum(is.na(x))
    c(
      mean = mean(x, na.rm = TRUE),
      sd = sd(x, na.rm = TRUE),
      # Must be max(): the earlier version called min() here, so the "max"
      # column silently duplicated the minimum.
      max = max(x, na.rm = TRUE),
      min = min(x, na.rm = TRUE),
      n = length(x) - n_missing,
      missing = n_missing
    )
  },
  simplify = TRUE
)
desc.est.iris
## Tipo Sepal.Length.mean Sepal.Length.sd Sepal.Length.max
## 1 setosa 5.0060 0.3525 5.8000
## 2 versicolor 5.9360 0.5162 7.0000
## 3 virginica 6.5880 0.6359 7.9000
## Sepal.Length.min Sepal.Length.n Sepal.Length.missing Sepal.Width.mean
## 1 4.3000 50.0000 0.0000 3.4280
## 2 4.9000 50.0000 0.0000 2.7700
## 3 4.9000 50.0000 0.0000 2.9740
## Sepal.Width.sd Sepal.Width.max Sepal.Width.min Sepal.Width.n
## 1 0.3791 4.4000 2.3000 50.0000
## 2 0.3138 3.4000 2.0000 50.0000
## 3 0.3225 3.8000 2.2000 50.0000
## Sepal.Width.missing Petal.Length.mean Petal.Length.sd Petal.Length.max
## 1 0.0000 1.4620 0.1737 1.9000
## 2 0.0000 4.2600 0.4699 5.1000
## 3 0.0000 5.5520 0.5519 6.9000
## Petal.Length.min Petal.Length.n Petal.Length.missing Petal.Width.mean
## 1 1.0000 50.0000 0.0000 0.2460
## 2 3.0000 50.0000 0.0000 1.3260
## 3 4.5000 50.0000 0.0000 2.0260
## Petal.Width.sd Petal.Width.max Petal.Width.min Petal.Width.n
## 1 0.1054 0.6000 0.1000 50.0000
## 2 0.1978 1.8000 1.0000 50.0000
## 3 0.2747 2.5000 1.4000 50.0000
## Petal.Width.missing
## 1 0.0000
## 2 0.0000
## 3 0.0000
# par(mfrow=c(2,2))
# One box plot per numeric measurement (columns 1 to 4), grouped by species.
for (k in 1:4) {
  # Title with the name of the column actually plotted. The earlier k + 1
  # index labelled every plot with the following column's name (and the last
  # one with "Species").
  boxplot(iris[, k] ~ iris$Species, main = paste(names(iris)[k], " by Species"))
}
# par(mfrow=c(1,1))
# Correlation matrix of the four numeric measurements (columns 1 to 4).
cor(x = iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000 -0.1176 0.8718 0.8179
## Sepal.Width -0.1176 1.0000 -0.4284 -0.3661
## Petal.Length 0.8718 -0.4284 1.0000 0.9629
## Petal.Width 0.8179 -0.3661 0.9629 1.0000
# Scatter-plot matrix of the four measurements. Each point's fill colour is
# selected by the integer code of its species factor level.
pairs(
  iris[1:4],
  main = "Anderson's Iris Data -- 3 species",
  pch = 21,
  bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)
# Panel function for pairs(): writes the absolute correlation of x and y in
# the centre of the panel, with the text size scaled by that correlation.
#
# x, y     numeric vectors to correlate.
# digits   number of significant digits passed to format().
# prefix   string placed in front of the formatted correlation.
# cex.cor  base character expansion; when missing it is derived from the
#          width of the label (0.8 / strwidth).
# ...      accepted so pairs() can forward extra arguments; not used here.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Save the caller's user coordinates and restore them on exit. The
  # parameter has to be named: par(usr) with a bare numeric vector does not
  # set anything, so the old on.exit(par(usr)) never restored the state.
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)

  # Work in unit coordinates so (0.5, 0.5) is the centre of the panel.
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) {
    cex.cor <- 0.8 / strwidth(txt)
  }
  text(0.5, 0.5, txt, cex = cex.cor * r)
}
# Panel function for the diagonal of pairs(): draws a histogram of x as cyan
# rectangles, with bar heights rescaled so the tallest bar has height 1.
#
# x    numeric vector to summarise.
# ...  extra graphical arguments forwarded to rect().
panel.hist <- function(x, ...) {
  # Save the caller's user coordinates and restore them on exit. The
  # parameter has to be named: par(usr) with a bare numeric vector does not
  # set anything, so the old on.exit(par(usr)) never restored the state.
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)

  # Keep the x range of the panel, but use a 0..1.5 y range so the rescaled
  # bars (maximum height 1) leave free space above them.
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  n_breaks <- length(breaks)
  heights <- h$counts / max(h$counts)
  rect(breaks[-n_breaks], 0, breaks[-1], heights, col = "cyan", ...)
}
# Scatter-plot matrix with smoothed scatter plots below the diagonal,
# absolute correlations above it and histograms on it. Points are filled by
# species.
pairs(
  iris[, 1:4],
  lower.panel = panel.smooth,
  upper.panel = panel.cor,
  diag.panel = panel.hist,
  pch = 21,
  bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)
Clustering
# Partition the four measurements into 3 groups with Lloyd's k-means.
# No seed is set in this snippet and kmeans() draws its starting centres at
# random when `centers` is a number, so the cluster numbering (and possibly
# the partition) can change from run to run.
kmeans3 <- kmeans(x = iris[, 1:4], centers = 3, algorithm = "Lloyd")
str(object = kmeans3)
## List of 7
## $ cluster : int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
## $ centers : num [1:3, 1:4] 5.01 6.85 5.9 3.43 3.07 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 681
## $ withinss : num [1:3] 15.2 23.9 39.8
## $ tot.withinss: num 78.9
## $ betweenss : num 603
## $ size : int [1:3] 50 38 62
## - attr(*, "class")= chr "kmeans"
# Cluster index assigned to each of the 150 observations.
kmeans3[["cluster"]]
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 2 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
# Same scatter-plot matrix as for the species, but with each point filled
# according to its k-means cluster index.
pairs(
  iris[1:4],
  main = "Anderson's Iris Data -- 3 kmeans partition",
  pch = 21,
  bg = c("red", "green3", "blue")[unclass(kmeans3$cluster)]
)
Matriz de confusión (confusion matrix)
# Cross-tabulate the true species (rows) against the k-means cluster index
# (columns).
table(iris[["Species"]], kmeans3[["cluster"]])
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 2 48
## virginica 0 36 14
Clustering jerárquico:
# Agglomerative hierarchical clustering (average linkage) on the Euclidean
# distances between the four measurements, followed by its dendrogram.
jerarquico <- hclust(dist(iris[, 1:4], method = "euclidean"), method = "average")
plot(jerarquico)
# Cut the tree into 3 groups. The tree object is `jerarquico`; the earlier
# code passed an undefined `h`, so cluster3.h was never created.
cluster3.h <- cutree(jerarquico, k = 3)
cluster3.h
No se ve bien
# Redraw the dendrogram twice: leaves labelled with the species, then with
# the k-means cluster index.
plot(jerarquico, labels = iris[["Species"]])
plot(jerarquico, labels = kmeans3[["cluster"]])
Componentes principales
# Principal component analysis of the four measurements, computed from the
# correlation matrix (cor = TRUE), then a summary of the variance explained
# by each component.
sol.pca <- princomp(iris[, 1:4], cor = TRUE)
summary(object = sol.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.7084 0.9560 0.38309 0.143926
## Proportion of Variance 0.7296 0.2285 0.03669 0.005179
## Cumulative Proportion 0.7296 0.9581 0.99482 1.000000
# Plot the variance of each principal component (scree plot), then a biplot
# of the fitted PCA object.
screeplot(sol.pca)
biplot(sol.pca)
¿Cómo representar mejor el círculo de correlación?…
Visualización de todos los componentes por species
# Scatter-plot matrix of the four principal component scores, with points
# filled by species.
pairs(
  sol.pca$scores[, 1:4],
  pch = 21,
  bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)
Y ahora por las 3 clases de kmeans
# Scatter-plot matrix of the four principal component scores, with points
# filled by k-means cluster index.
pairs(
  sol.pca$scores[, 1:4],
  pch = 21,
  bg = c("red", "green3", "blue")[unclass(kmeans3$cluster)]
)