# K-means clustering and 1-NN classification of the Wisconsin breast-cancer data
require(cluster)
require(class)
require(stats)
library(RWeka)
require(stringer)
# Load the raw data ----
# The file is comma-separated and has no header row. Strings are kept as
# character (not factor) so they can be cleaned and converted explicitly later.
data <- read.table(
  "breast-cancer-wisconsin.data",
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)
# First look at the raw columns (default names V1, V2, ...)
head(data)

# Give the 11 columns short names: sample id, nine features, class label
names(data) <- c(
  "id", "ct", "ucsize", "ucshape", "ma", "secs",
  "bn", "bc", "nn", "miti", "class"
)
head(data)
# Clean up the data ----
library(stringr)

# Convert a vector to numeric by way of its character representation.
to_numeric <- function(x) as.numeric(as.character(x))

# Clean a single column:
#   1. strip every run of whitespace,
#   2. turn any entry that still contains a non-digit character into NA,
#   3. convert what is left to numeric.
# Entries that are already NA stay NA.
clean_column <- function(x) {
  stripped <- str_replace_all(as.character(x), "\\s+", "")
  has_non_digit <- !is.na(stripped) & str_detect(stripped, "\\D")
  stripped[has_non_digit] <- NA_character_
  to_numeric(stripped)
}

# Work column by column and assign through `data[]` so that `data` stays a
# data frame and keeps its column names (apply() would coerce it to a
# character matrix and lose them).
data[] <- lapply(data, clean_column)

# Every column should now report mode "numeric"
vapply(data, mode, character(1))

# I'm not sure what the best way to deal with NAs is,
# so just drop every row that contains one
data <- na.omit(data)

# Lost a few data points
dim(data)
# K-means clustering ----
# Cluster on the nine feature columns only (columns 2 to 10); column 1 is the
# sample id and column 11 the class label. Two clusters are requested.
# NOTE: kmeans() picks random starting centres, so cluster numbering (and
# possibly the partition) can differ between runs.
fit <- kmeans(data[, 2:10], 2)
names(fit)

# k-means did a fairly good job: cross-tabulate cluster against class label
table(cluster = fit$cluster, class = data[, 11])

# Within-cluster sum of squares, one value per cluster
fit$withinss
# 1-NN classification against the k-means centres ----
# knn(train, test, cl, k) expects the *labelled* points in `train`, one label
# per row of `train` in `cl`, and the points to classify in `test`. Both
# matrices must have the same columns, so only the nine features are used.
feature_cols <- 2:10
class_col <- 11

# Labelled data: the cluster centres found by k-means
train <- fit$centers

# Label each centre with the majority class of the rows assigned to its
# cluster. Cluster numbering is arbitrary, so the labels cannot be hard-coded.
class_levels <- sort(unique(as.character(data[, class_col])))
cl1 <- factor(
  vapply(
    seq_len(nrow(train)),
    function(k) {
      names(which.max(table(data[fit$cluster == k, class_col])))
    },
    character(1)
  ),
  levels = class_levels
)

# Data to classify: the first 137 rows (fewer if the data set is smaller)
test_rows <- seq_len(min(137L, nrow(data)))
test <- data[test_rows, feature_cols]

pred <- knn(train = train, test = test, cl = cl1, k = 1)

# Confusion table: predicted label against the true class of the same rows
table(pred = pred, class = data[test_rows, class_col])