# Import encounters (merged) data, plus the individuals table.
# Both files are read from hard-coded paths under the user's home directory.
enc <- read.csv('~/Desktop/2A/speed_dating/merged_dataset.csv')
ind <- read.csv('~/Desktop/2A/speed_dating/individuals_table.csv')
# Reformat ethnicity variables (the individual's and the partner's) as
# labelled factors. Each column is converted from its OWN values, so that
# 'race_o' is not overwritten with a copy of 'race'.
# NOTE(review): labels are assigned positionally to the sorted distinct codes
# found in each column; this presumes both columns hold the same five codes,
# in the order black / caucasian / hispanic / asian / other -- confirm
# against the dataset codebook.
race_labels <- c('black', 'caucasian', 'hispanic', 'asian', 'other')
for (var in c('race', 'race_o')) {
  enc[, var] <- as.factor(enc[, var])
  levels(enc[, var]) <- race_labels
}
# Names of the subjective score columns: the five base ratings first, then
# the same five ratings carrying the '_o' suffix.
scores <- as.vector(
  outer(c('attr', 'sinc', 'intel', 'fun', 'amb'), c('', '_o'), paste0)
)
# Names of the decision dummy variables.
decision <- c('match', 'dec', 'dec_o')
# Convert the shared-ethnicity dummy, every score and every decision dummy
# to a factor, all columns in a single assignment.
enc[c('same_race', scores, decision)] <- lapply(
  enc[c('same_race', scores, decision)], as.factor
)
# Recode the career variable (at individual level) into five broad groups.
# The mapping from detailed codes to groups is spelled out explicitly, so the
# result depends neither on the order of the recoding steps nor on which
# groups happen to be present in the data.
career_groups <- list(
  'social sciences' = c(1, 7, 8, 9, 11, 13, 16),
  'hard sciences'   = c(2, 3, 4, 5, 12),
  'arts'            = c(6, 17),
  'sports'          = 14,
  'other/undecided' = c(10, 15)
)
# Flat lookup: the i-th code in 'career_codes' belongs to the i-th label.
career_codes <- unlist(career_groups, use.names = FALSE)
career_labels <- rep(names(career_groups), lengths(career_groups))
# Codes that are missing or not listed above become NA.
career <- factor(
  career_labels[match(ind$career_c, career_codes)],
  levels = names(career_groups)
)
names(career) <- ind$iid
# Merge the previous information into the encounters data: look up each
# individual ('iid') and each partner ('pid') by id in the named factor.
enc$career <- unname(career[as.character(enc$iid)])
enc$career_o <- unname(career[as.character(enc$pid)])
# Reformat hobbies variables.
hobbies <- grep('^hobbies', colnames(ind), value = TRUE)
# Short hobby names: the column names without their 'hobbies_' prefix.
# They must be defined before being used to label the favorite hobby.
hobbies_names <- sub('^hobbies_', '', hobbies)
# Identify the best-graded hobby for each individual: the first one in
# column order on ties, NA when every grade of the row is missing.
best <- apply(as.matrix(ind[, hobbies, drop = FALSE]), 1, function(grades) {
  top <- which.max(grades)
  if (length(top) == 0L) NA_integer_ else unname(top)
})
favorite <- hobbies_names[best]
names(favorite) <- ind$iid
# Distribute the information in the encounter table, looking up the
# individual ('iid') and the partner ('pid') by id, and store it as factors.
# Levels are matched by name (not by position) and follow the column order
# of the hobbies; hobbies that are never a favorite are dropped so that no
# empty category is carried forward.
id_columns <- c(hobbies = 'iid', hobbies_o = 'pid')
for (var in names(id_columns)) {
  picked <- unname(favorite[as.character(enc[[id_columns[[var]]]])])
  enc[[var]] <- droplevels(factor(picked, levels = hobbies_names))
}
# Alternative: run a PCA, then an HCPC clustering, and use cluster membership.
#pca <- FactoMineR::PCA(ind[, hobbies], ncp=5, graph=FALSE)
#hcpc <- FactoMineR::HCPC(pca, nb.clust=4, graph=FALSE)
#hobbies <- hcpc$data.clust$clust
#names(hobbies) <- ind$iid
#enc$hobbies <- sapply(as.character(enc$iid), function(x) hobbies[x])
#enc$hobbies_o <- sapply(as.character(enc$pid), function(x) hobbies[x])
# Subselect the data to keep: ethnicity variables, subjective scores,
# careers, favorite hobbies and, last, the decision dummies.
varnames <- c(
  'race', 'race_o', 'same_race', scores, 'career', 'career_o',
  'hobbies', 'hobbies_o', decision
)
enc <- enc[, varnames]
# Run the MCA, passing the decision dummies as supplementary qualitative
# variables. Their column positions are computed from 'varnames' rather than
# hard-coded, so they stay right if the selection above changes.
mca <- FactoMineR::MCA(enc, quali.sup = match(decision, varnames))