# Packages
library(tm) # Text mining: Corpus and Document Term Matrix
library(class) # KNN model
library(SnowballC) # Stemming words
#library(Matrix)
# Text classification with k-nearest neighbours.
#
# Pipeline: read a CSV of labelled texts -> build a term-frequency
# document-term matrix -> split rows 70/30 into training and test sets ->
# classify the test rows with KNN (k = 3) -> report a confusion matrix and
# accuracy -> write the predictions to "output.csv".

# Read csv with two columns: text and category
df <- read.csv(file.choose(), sep = ",", header = TRUE,
               stringsAsFactors = FALSE)
stopifnot("Category" %in% names(df), ncol(df) >= 2L, nrow(df) >= 2L)

# The text column is taken to be the first column that is not `Category`.
# NOTE(review): confirm this against the actual CSV header.
text_col <- setdiff(names(df), "Category")[1L]

# A data frame cannot be coerced to a DocumentTermMatrix directly: build a
# corpus from the text column first, then derive the DTM from the corpus.
corpus <- VCorpus(VectorSource(as.character(df[[text_col]])))
myDTM <- DocumentTermMatrix(corpus, control = list(weighting = weightTf))

# Dense numeric matrix of term counts (one row per document): this is the
# feature matrix handed to knn().
modeldata <- as.matrix(myDTM)

# Class labels, one per document, in the same row order as `modeldata`.
cl <- factor(df$Category)

# Same features as a data frame with the category appended - a data frame is
# easier to inspect interactively.
mat.df <- as.data.frame(modeldata, stringsAsFactors = FALSE)
mat.df <- cbind(mat.df, df$Category)

# Split data by row number: 70% training, the remainder for testing
n_docs <- nrow(modeldata)
train <- sample(n_docs, ceiling(n_docs * 0.70))
test <- setdiff(seq_len(n_docs), train)
# With very few rows the 70% share can swallow every document.
stopifnot(length(test) > 0L)
train
test

# Create model: training set, test set, training set classifier.
# `cl` must hold the labels of the TRAINING rows only (one label per row of
# the first argument), hence cl[train]. drop = FALSE keeps a matrix even when
# a subset has a single row.
knn.pred <- knn(
  modeldata[train, , drop = FALSE],
  modeldata[test, , drop = FALSE],
  cl = cl[train],
  k = 3
)

# Confusion matrix
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat

# Accuracy (percentage of test documents classified correctly)
(accuracy <- sum(diag(conf.mat)) / length(test) * 100)

# Create data frame with test data and predicted category.
# data.frame() keeps the predicted labels as a factor; cbind() with a matrix
# would silently turn them into integer codes.
df.pred <- data.frame(
  knn.pred = knn.pred,
  modeldata[test, , drop = FALSE],
  check.names = FALSE
)
write.table(df.pred, file = "output.csv", sep = ";")