1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
|
# Assign a CLUSTER id to every row of modelTable, header by header, then
# build groupTable with one row per distinct (HEADERS, CLUSTER) pair plus
# NBCLUSTERS, the largest CLUSTER id found for that header.
#
# Expects modelTable to be a data.table with a HEADERS column. Uses
# data.table (`:=`, unique(by = )) and stringdist (stringdistmatrix), which
# must already be attached. modelTable is modified by reference.
headers <- unique(modelTable$HEADERS)
modelTable[, CLUSTER := integer(nrow(modelTable))]

# Rows whose HEADERS is NA can never satisfy `h == HEADERS`, so they keep the
# initial CLUSTER of 0. NA is skipped here so the loop never has to process an
# empty subset.
for (h in headers[!is.na(headers)]) {
  dt <- modelTable[h == modelTable$HEADERS]
  headersu <- unique(dt$HEADERS)

  if (length(headersu) <= 2) {
    # Too few distinct values to cluster: number the rows 1..n instead.
    # seq_len() rather than 1:nrow(dt), which would give c(1, 0) for n == 0.
    dt[, CLUSTER := seq_len(nrow(dt))]
  } else {
    # NOTE(review): dt is filtered to a single header value above, so headersu
    # always has length 1 and this branch is not reached as written. It is kept
    # because the intended clustering input is unclear -- confirm whether dt
    # should span several header values (cf. the former `dt = modelTable`).
    #
    # Jaro-Winkler distances between the distinct headers, clustered by
    # k-means. The distance matrix is only needed here, so it is built here.
    d <- stringdistmatrix(headersu, headersu, method = "jw")
    distmat <- as.dist(d, diag = TRUE)
    k <- kmeans(distmat, min(nrow(d) - 1, length(headersu)))
    # k$cluster is ordered like headersu; map each row to its header's cluster.
    dt[, CLUSTER := k$cluster[match(HEADERS, headersu)]]
  }

  # Copy the ids computed on the subset back onto the matching rows.
  modelTable[h == modelTable$HEADERS, CLUSTER := dt$CLUSTER]
}

groupTable <- unique(modelTable, by = c("HEADERS", "CLUSTER"))
groupTable[, NBCLUSTERS := max(CLUSTER), by = "HEADERS"]
Partager