Para acabar el iris data set

Cargamos nuestro querido iris data set (notemos que no hay valores perdidos).

# Load the built-in iris data set and display its structure.
data("iris")
str(object = iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Estadísticos descriptivos muy básicos:

# Basic per-species descriptive statistics for the four numeric columns.
# For every (species, variable) pair the summary function returns the mean,
# standard deviation, maximum, minimum, number of non-missing values and
# number of missing values.
desc.est.iris <- aggregate(iris[, 1:4], by = list(Tipo = iris$Species), FUN = function(x) {
    n_obs <- sum(!is.na(x))
    c(mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE),
        # "max" must come from max(); calling min() here made it a copy of "min".
        max = max(x, na.rm = TRUE), min = min(x, na.rm = TRUE),
        n = n_obs, missing = length(x) - n_obs)
}, simplify = TRUE)
desc.est.iris

##         Tipo Sepal.Length.mean Sepal.Length.sd Sepal.Length.max
## 1     setosa            5.0060          0.3525           5.8000
## 2 versicolor            5.9360          0.5162           7.0000
## 3  virginica            6.5880          0.6359           7.9000
##   Sepal.Length.min Sepal.Length.n Sepal.Length.missing Sepal.Width.mean
## 1           4.3000        50.0000               0.0000           3.4280
## 2           4.9000        50.0000               0.0000           2.7700
## 3           4.9000        50.0000               0.0000           2.9740
##   Sepal.Width.sd Sepal.Width.max Sepal.Width.min Sepal.Width.n
## 1         0.3791          4.4000          2.3000       50.0000
## 2         0.3138          3.4000          2.0000       50.0000
## 3         0.3225          3.8000          2.2000       50.0000
##   Sepal.Width.missing Petal.Length.mean Petal.Length.sd Petal.Length.max
## 1              0.0000            1.4620          0.1737           1.9000
## 2              0.0000            4.2600          0.4699           5.1000
## 3              0.0000            5.5520          0.5519           6.9000
##   Petal.Length.min Petal.Length.n Petal.Length.missing Petal.Width.mean
## 1           1.0000        50.0000               0.0000           0.2460
## 2           3.0000        50.0000               0.0000           1.3260
## 3           4.5000        50.0000               0.0000           2.0260
##   Petal.Width.sd Petal.Width.max Petal.Width.min Petal.Width.n
## 1         0.1054          0.6000          0.1000       50.0000
## 2         0.1978          1.8000          1.0000       50.0000
## 3         0.2747          2.5000          1.4000       50.0000
##   Petal.Width.missing
## 1              0.0000
## 2              0.0000
## 3              0.0000

# Draw one boxplot per numeric variable, split by species.
# par(mfrow=c(2,2))
for (k in seq_len(4)) {
    # Title each plot with the name of the column actually plotted (column k);
    # indexing names(iris) with k + 1 labelled it with the next column's name.
    col_name <- names(iris)[k]
    boxplot(iris[, k] ~ iris$Species, main = paste(col_name, " by Species"))
}

plot of chunk unnamed-chunk-3

# par(mfrow=c(1,1))

# Correlation matrix of the four numeric measurements.
cor(x = iris[, seq_len(4)])

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length       1.0000     -0.1176       0.8718      0.8179
## Sepal.Width       -0.1176      1.0000      -0.4284     -0.3661
## Petal.Length       0.8718     -0.4284       1.0000      0.9629
## Petal.Width        0.8179     -0.3661       0.9629      1.0000

# Scatterplot matrix of the four measurements; the point fill colour is
# picked from the species factor codes.
pairs(
    iris[1:4],
    main = "Anderson's Iris Data -- 3 species",
    pch = 21,
    bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)

plot of chunk unnamed-chunk-5

# Panel function for pairs(): writes the absolute correlation between x and y
# in the centre of the panel, with a text size proportional to that value.
#
# x, y     numeric vectors with the data of the current panel.
# digits   number of significant digits used to format the correlation.
# prefix   string prepended to the formatted correlation.
# cex.cor  base character expansion; when missing it is set to
#          0.8 / strwidth(label).
# ...      accepted so that extra arguments can be passed; not used here.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
    # Save the user coordinates and restore them when the function exits.
    # The value has to be passed by name: a bare par(usr) sets nothing.
    usr <- par("usr")
    on.exit(par(usr = usr), add = TRUE)
    # Work in a unit square so the label can be centred at (0.5, 0.5).
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    # Format r together with a reference value and keep only r's string.
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if (missing(cex.cor)) {
        cex.cor <- 0.8 / strwidth(txt)
    }
    text(0.5, 0.5, txt, cex = cex.cor * r)
}
# Panel function for pairs(): draws a histogram of x inside the panel, with
# bar heights rescaled so that the tallest bar has height 1.
#
# x    numeric vector with the data of the current (diagonal) panel.
# ...  extra arguments forwarded to rect().
panel.hist <- function(x, ...) {
    # Save the user coordinates and restore them when the function exits.
    # The value has to be passed by name: a bare par(usr) sets nothing.
    usr <- par("usr")
    on.exit(par(usr = usr), add = TRUE)
    # Keep the x range of the panel but use a 0..1.5 vertical range, which
    # leaves free space above the bars.
    par(usr = c(usr[1:2], 0, 1.5))
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts / max(h$counts)
    # One rectangle per bin: left edges are all breaks but the last, right
    # edges are all breaks but the first.
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}

# Scatterplot matrix using panel.smooth below the diagonal, panel.cor above
# it and panel.hist on it; the point fill colour follows the species.
pairs(
    iris[, 1:4],
    lower.panel = panel.smooth,
    upper.panel = panel.cor,
    diag.panel = panel.hist,
    pch = 21,
    bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)

plot of chunk unnamed-chunk-6

Clustering

# Partition the four measurements into 3 clusters with the "Lloyd" k-means
# algorithm, then show the structure of the fitted object.
kmeans3 <- kmeans(x = iris[, 1:4], centers = 3, algorithm = "Lloyd")
str(object = kmeans3)

## List of 7
##  $ cluster     : int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
##  $ centers     : num [1:3, 1:4] 5.01 6.85 5.9 3.43 3.07 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 681
##  $ withinss    : num [1:3] 15.2 23.9 39.8
##  $ tot.withinss: num 78.9
##  $ betweenss   : num 603
##  $ size        : int [1:3] 50 38 62
##  - attr(*, "class")= chr "kmeans"

# Cluster label assigned to each of the observations.
kmeans3[["cluster"]]

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 2 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3

# Same scatterplot matrix as for the species, but the point fill colour is
# now picked from the k-means cluster label.
pairs(
    iris[1:4],
    main = "Anderson's Iris Data -- 3 kmeans partition",
    pch = 21,
    bg = c("red", "green3", "blue")[kmeans3[["cluster"]]]
)

plot of chunk unnamed-chunk-7

Matriz de confusión (confusion matrix)

# Cross-tabulate the true species (rows) against the k-means cluster (columns).
table(iris[["Species"]], kmeans3[["cluster"]])

##             
##               1  2  3
##   setosa     50  0  0
##   versicolor  0  2 48
##   virginica   0 36 14

Clustering jerárquico:

# Hierarchical clustering with average linkage on the Euclidean distances
# between the four measurements, followed by its dendrogram.
jerarquico <- hclust(
    d = dist(iris[, 1:4], method = "euclidean"),
    method = "average"
)
plot(jerarquico)

plot of chunk unnamed-chunk-9


# Cut the dendrogram into 3 groups. The tree is stored in `jerarquico`; the
# call used to reference an undefined object `h` and failed with
# "object 'h' not found", so `cluster3.h` was never created.
cluster3.h <- cutree(jerarquico, 3)

# Group label assigned to each observation.
cluster3.h

No se ve bien

# Dendrogram with each leaf labelled by its species.
plot(jerarquico, labels = iris[["Species"]])

plot of chunk unnamed-chunk-10

# Dendrogram with each leaf labelled by its k-means cluster.
plot(jerarquico, labels = kmeans3[["cluster"]])

plot of chunk unnamed-chunk-10

Componentes principales

# Principal component analysis of the four measurements based on the
# correlation matrix (cor = TRUE), followed by its summary.
sol.pca <- princomp(x = iris[, 1:4], cor = TRUE)
summary(sol.pca)

## Importance of components:
##                        Comp.1 Comp.2  Comp.3   Comp.4
## Standard deviation     1.7084 0.9560 0.38309 0.143926
## Proportion of Variance 0.7296 0.2285 0.03669 0.005179
## Cumulative Proportion  0.7296 0.9581 0.99482 1.000000

# Scree plot of the principal components.
screeplot(x = sol.pca)

plot of chunk unnamed-chunk-11

# Biplot of the principal component solution.
biplot(x = sol.pca)

plot of chunk unnamed-chunk-11

¿Cómo representar mejor el círculo de correlación?

Visualización de todos los componentes por especie

# Scatterplot matrix of the four component scores; the point fill colour
# follows the species.
pairs(
    sol.pca$scores[, 1:4],
    pch = 21,
    bg = c("red", "green3", "blue")[as.integer(iris$Species)]
)

plot of chunk unnamed-chunk-12

Y ahora por las 3 clases de kmeans

# Scatterplot matrix of the four component scores; the point fill colour
# follows the k-means cluster label.
pairs(
    sol.pca$scores[, 1:4],
    pch = 21,
    bg = c("red", "green3", "blue")[kmeans3[["cluster"]]]
)

plot of chunk unnamed-chunk-13