library(ggplot2)
library(grid)
library(stringr)
library(reshape)
library(scales)
library(stringi)
library(plyr)
# NOTE(review): in the code visible here gdata is only referenced by a
# commented-out reorder.factor() call (presumably gdata's); it is kept as
# require() so that a missing gdata does not abort the script.
require(gdata)

# Survey responses; every row is counted as one participant.
# stringsAsFactors = TRUE is requested explicitly because the levels()
# manipulation below needs factor columns, and R >= 4.0.0 no longer creates
# them by default.
data <- read.csv("survey_analysis_data.csv", stringsAsFactors = TRUE)
num_participants <- nrow(data)

# Compute the midpoint of every segment of one stacked bar, so that a label
# can be placed in the middle of the segment.
#
# g: data frame with columns `value` and `Freq`; segments are accumulated in
#    row order.
# Returns a data frame with columns `value`, `pos` (segment midpoint) and
# `Freq`.
group_midpoints <- function(g) {
  cumsums <- c(0, cumsum(g$Freq))
  diffs <- diff(cumsums)
  pos <- head(cumsums, -1) + (0.5 * diffs)
  data.frame(value = g$value, pos = pos, Freq = g$Freq)
}

# Run a chi-squared test for every pairing of a column named in `vector_one`
# with a column named in `vector_two`, Bonferroni-adjust the p-values, print
# a \pgfkeyssetvalue line for every adjusted p-value below 0.05 and write
# the full result table to `fileName`.
#
# vector_one, vector_two: character vectors of column names of the global
#                         `data` frame.
# fileName:               path of the CSV file to write.
# Side effects: cat() output (goes to the active sink, if any) and the CSV.
chi_squared_multiple_test <- function(vector_one, vector_two, fileName) {
  combs <- expand.grid(vector_one, vector_two, stringsAsFactors = FALSE)
  row_vars <- as.character(combs[[1]])
  col_vars <- as.character(combs[[2]])
  n_tests <- nrow(combs)

  # Typed columns: statistics stay numeric instead of being coerced to
  # character by building each row with c().
  results <- data.frame(
    Row = row_vars,
    Column = col_vars,
    Chi.Square = rep(NA_real_, n_tests),
    df = rep(NA_real_, n_tests),
    p.value = rep(NA_real_, n_tests),
    stringsAsFactors = FALSE
  )

  for (i in seq_len(n_tests)) {
    test <- chisq.test(data[[row_vars[i]]], data[[col_vars[i]]])
    results$Chi.Square[i] <- round(unname(test$statistic), 7)
    results$df[i] <- unname(test$parameter)
    results$p.value[i] <- round(test$p.value, 7)
  }

  # adjust using conservative bonferroni method (Holm's used by Robillard)
  results$p.value <- p.adjust(results$p.value, method = "bonferroni")

  # which() drops NA p-values, which would otherwise make `if` fail.
  for (i in which(results$p.value < 0.05)) {
    cat(sprintf(
      "\\pgfkeyssetvalue{%s_%s_p_value}{%0.4f}\n",
      row_vars[i], col_vars[i], results$p.value[i]
    ))
  }

  write.csv(results, fileName)
}

# Style a bar chart, label every bar with its percentage and save it.
#
# plot:             ggplot object with a discrete x aesthetic and no layers.
# fileName:         output path handed to ggsave().
# use_total_partic: if TRUE the percentages are relative to the global
#                   `num_participants`; otherwise relative to the number of
#                   plotted observations.
# width, height, dpi: forwarded to ggsave().
plot_bar_chart <- function(plot, fileName, use_total_partic = FALSE,
                           width = 11.5, height = 2.25, dpi = 300) {
  if (use_total_partic) {
    # The participant count is spliced into the expression as a literal
    # with bquote(). The original code hard-coded 37 here because of an
    # error (see its FIXME); presumably the stat-computed aesthetics are
    # evaluated where the global variable is not visible - TODO confirm.
    label_aes <- eval(bquote(aes(
      y = (..count..),
      label = paste(round((..count..) / .(num_participants) * 100),
                    sep = "", "%")
    )))
  } else {
    label_aes <- aes(
      y = (..count..),
      label = paste(round((..count..) / sum(..count..) * 100),
                    sep = "", "%")
    )
  }

  # NOTE(review): stat = "bin" on a discrete x looks like pre-2.0 ggplot2
  # usage (newer releases expect stat = "count") - confirm the target
  # ggplot2 version before changing it.
  plot <- plot +
    theme_classic() +
    geom_bar(width = 0.4, fill = "#808080", position = position_dodge(0.9)) +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.y = element_blank(),
      axis.line.y = element_blank(),
      axis.text.x = element_text(size = 26)
    ) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, num_participants)) +
    scale_x_discrete(
      drop = FALSE,
      labels = function(x) str_wrap(x, width = 10)
    ) +
    geom_text(label_aes, stat = "bin", colour = "black", size = 8,
              vjust = -0.5)

  # Save the labelled plot explicitly instead of relying on last_plot().
  ggsave(fileName, plot = plot, width = width, height = height, dpi = dpi)
}

#################### Occupations ####################
# Aggregate the detailed occupations into broader groups. Assigning the
# same label to several levels merges them; absent levels are ignored.
levels(data$Occupation)[
  levels(data$Occupation) %in%
    c("Academic Researcher", "Industrial Researcher")
] <- "Researcher"
levels(data$Occupation)[
  levels(data$Occupation) %in%
    c("Graduate student", "Undergraduate student")
] <- "Student"
levels(data$Occupation)[
  levels(data$Occupation) %in%
    c("Freelance developer", "Industrial developer")
] <- "Professional Developer"

occupation_plot <- ggplot(
  data,
  aes(factor(
    Occupation,
    levels = c("Student", "Researcher", "Professional Developer", "Other"),
    ordered = TRUE
  ))
)
plot_bar_chart(occupation_plot, "../../figures/study4_occupation.pdf")

#################### Java Experience ####################
# Add the level nobody picked; union() keeps this safe if it already exists.
levels(data$JavaExperience) <- union(levels(data$JavaExperience), "< 1 year")

java_experience_plot <- ggplot(
  data,
  aes(factor(
    JavaExperience,
    levels = c("< 1 year", "1 - 2 years", "2 - 5 years", "6 - 10 years",
               "11+ years"),
    ordered = TRUE
  ))
)
plot_bar_chart(java_experience_plot,
               "../../figures/study4_java_experience.pdf")

#################### Cryptography Knowledge ####################
levels(data$CryptoKnowledge) <- union(levels(data$CryptoKnowledge),
                                      "Not Knowledgeable")
levels(data$CryptoKnowledge)[
  levels(data$CryptoKnowledge) %in%
    "Very knowledgeable - I know all/most areas of cryptography, the different available algorithms, and what they are used for"
] <- "Very Knowledgeable"
# Shorten the remaining verbose cryptography-knowledge answers.
levels(data$CryptoKnowledge)[
  levels(data$CryptoKnowledge) %in%
    "Somewhat knowledgeable - I have a vague idea about the various areas of cryptography and what they are used for"
] <- "Somewhat Knowledgeable"
levels(data$CryptoKnowledge)[
  levels(data$CryptoKnowledge) %in%
    "Knowledgeable - I am familiar with the various areas of cryptography and what they are used for"
] <- "Knowledgeable"

crypto_knowledge_plot <- ggplot(
  data,
  aes(factor(
    CryptoKnowledge,
    levels = c("Not Knowledgeable", "Somewhat Knowledgeable",
               "Knowledgeable", "Very Knowledgeable"),
    ordered = TRUE
  ))
)
plot_bar_chart(crypto_knowledge_plot,
               "../../figures/study4_crypto_knowledge.pdf")

#################### Cryptography Use ####################
# union() adds a level only when it is missing, so these additions no longer
# depend on which answers happen to occur in the data.
levels(data$CryptoUse) <- union(levels(data$CryptoUse), "Never")
levels(data$CryptoUse)[
  levels(data$CryptoUse) %in%
    "Rarely - I use cryptography in less than 33% of my development tasks"
] <- "Rarely"
levels(data$CryptoUse)[
  levels(data$CryptoUse) %in%
    "Occasionally - I use cryptography in more than 33% but less than 66% of my development tasks"
] <- "Occasionally"
levels(data$CryptoUse)[
  levels(data$CryptoUse) %in%
    "Frequently - I use cryptography in more than 66% of my development tasks"
] <- "Frequently"

#pdf("../../figures/study4_crypto_use.pdf")
crypto_use_plot <- ggplot(
  data,
  aes(factor(
    CryptoUse,
    levels = c("Never", "Rarely", "Occasionally", "Frequently"),
    ordered = TRUE
  ))
)
plot_bar_chart(crypto_use_plot, "../../figures/study4_crypto_use.pdf")

# For all task-rank plots below: use_total_partic = TRUE is required because
# rows with NA are dropped before plotting, and the percentages would
# otherwise be computed against the reduced total.

############ Task_UserAuth ##############
levels(data$Task_UserAuth) <- union(levels(data$Task_UserAuth), "5")
new_data <- data[!is.na(data$Task_UserAuth), ]
user_auth_plot <- ggplot(
  new_data,
  aes(factor(Task_UserAuth, levels = c("1", "2", "3", "4", "5"),
             ordered = TRUE))
)
plot_bar_chart(user_auth_plot, "../../figures/study4_userauth.pdf",
               use_total_partic = TRUE)

############ Task_EncryptFile ##############
levels(data$Task_EncryptFile) <- union(levels(data$Task_EncryptFile), "5")
new_data <- data[!is.na(data$Task_EncryptFile), ]
encrypt_file_plot <- ggplot(
  new_data,
  aes(factor(Task_EncryptFile, levels = c("1", "2", "3", "4", "5"),
             ordered = TRUE))
)
plot_bar_chart(encrypt_file_plot, "../../figures/study4_encrypt_file.pdf",
               use_total_partic = TRUE)

############ Task_SecureConn ##############
levels(data$Task_SecureConn) <- union(levels(data$Task_SecureConn), "5")
new_data <- data[!is.na(data$Task_SecureConn), ]
secure_conn_plot <- ggplot(
  new_data,
  aes(factor(Task_SecureConn, levels = c("1", "2", "3", "4", "5"),
             ordered = TRUE))
)
plot_bar_chart(secure_conn_plot, "../../figures/study4_secure_conn.pdf",
               use_total_partic = TRUE)

############ Task_TransferFile ##############
new_data <- data[!is.na(data$Task_TransferFile), ]
transfer_file_plot <- ggplot(
  new_data,
  aes(factor(Task_TransferFile, levels = c("1", "2", "3", "4", "5"),
             ordered = TRUE))
)
plot_bar_chart(transfer_file_plot, "../../figures/study4_transfer_file.pdf",
               use_total_partic = TRUE)

############ TaskOtherSign ##############
# The original note here said that adding a level is data dependent: rank 5
# already occurs in this column, so "4" was added instead. The same level is
# still added, but through union(), which cannot create a duplicate level.
levels(data$TaskOtherSign) <- union(levels(data$TaskOtherSign), "4")
new_data <- data[!is.na(data$TaskOtherSign), ]
signature_plot <- ggplot(
  new_data,
  aes(factor(TaskOtherSign, levels = c("1", "2", "3", "4", "5"),
             ordered = TRUE))
)
plot_bar_chart(signature_plot, "../../figures/study4_signatures.pdf",
               use_total_partic = TRUE)

############ Ranked Obstacles ##############
obstacles <- c("CorrAlgm", "Concepts", "WhichJavaAPI", "Env", "Sequence",
               "Parameters", "Provider", "UnderlyImpl", "ErrorMessages")

# Long format: one row per (participant, obstacle) answer.
obstacle_data <- data[obstacles]
obstacle_data$category <- row.names(obstacle_data)
obstacle_data <- melt(obstacle_data, id.vars = "category")

# Frequency of every answer per obstacle.
dat <- data.frame(with(obstacle_data, table(variable, value)))

#target=ordered(c("Frequently",
#                 "Occasionally",
#                 "Rarely",
#                 "Never",
#                 "Don't know"))
#dat$value <- reorder.factor(dat$value, new.order=target)

dat2 <- ddply(dat, .(variable), group_midpoints)
dat2$pos[dat2$Freq %in% c(0)] <- -1 # hide 0% labels (outside the y limits)
# Percentage label relative to all participants, computed up front instead
# of hard-coding the participant count inside aes().
dat2$label <- paste0(round(dat2$Freq / num_participants * 100), "%")

# NOTE(review): the upper y limit of 41 is hard-coded; presumably head-room
# above the participant count - confirm.
obstacle_plot <- ggplot(dat2, aes(x = variable, fill = value)) +
  geom_bar(aes(weight = Freq), position = "stack", width = 0.5) +
  #geom_bar(width=0.4,aes(y = (..count..)/37)) +
  theme_classic() +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 8),
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 10),
    panel.grid.minor = element_line(colour = "grey", linetype = "dashed",
                                    size = 0.25)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 41)) +
  scale_x_discrete(labels = c(
    "CorrAlgm" = str_wrap("Identify Correct Algm", width = 10),
    "Concepts" = str_wrap("Identify crypto concepts", width = 10),
    "WhichJavaAPI" = str_wrap("Identify Java API", width = 10),
    "Env" = str_wrap("Setup environment", width = 10),
    "Sequence" = str_wrap("Identify sequence of API calls", width = 12),
    "Parameters" = str_wrap("Identify parameters", width = 10),
    "Provider" = str_wrap("Identify provider", width = 10),
    "UnderlyImpl" = str_wrap("Understand API implementation", width = 10),
    "ErrorMessages" = str_wrap("Understand error msgs", width = 10)
  )) +
  scale_fill_grey(start = 0.2, end = .8) +
  geom_text(position = "identity",
            aes(x = variable, y = pos, label = label),
            colour = "white", size = 3) +
  ylab("Number of Participants")

# Save the plot explicitly instead of relying on last_plot().
ggsave("../../figures/study4_rated_obstacles.pdf", plot = obstacle_plot,
       width = 9, height = 4.5, dpi = 300)

########## output percentages to latex #####################
occupation_percentages <- prop.table(table(data$Occupation))
java_exp_percentages <- prop.table(table(data$JavaExperience))
crypto_knowl_percentages <- prop.table(table(data$CryptoKnowledge))
crypto_use_percentages <- prop.table(table(data$CryptoUse))
secure_conn_counts <- table(data$Task_SecureConn)
user_auth_counts <- table(data$Task_UserAuth)
api_sequence_percentage <- prop.table(table(data$Sequence))
api_to_use_percentage <- prop.table(table(data$WhichJavaAPI))
underlyin_percentage <- prop.table(table(data$UnderlyImpl))
concepts_percentage <- prop.table(table(data$Concepts))
corr_algm_percentages <- prop.table(table(data$CorrAlgm))
param_percentages <- prop.table(table(data$Parameters))
env_setup_percentages <- prop.table(table(data$Env))

# From here on all cat() output is redirected into the LaTeX key file until
# the matching sink() call later in the script.
sink("../../sections/survey_data.tex")

# first two hard-coded for now
cat("\\pgfkeyssetvalue{total_responded}{43}\n")
cat("\\pgfkeyssetvalue{ignored}{6}\n")
cat(sprintf("\\pgfkeyssetvalue{total_analyzed}{%d}\n", nrow(data)))

cat(sprintf(
  "\\pgfkeyssetvalue{percentage_students}{%d}\n",
  round(occupation_percentages["Student"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_professional}{%d}\n",
  round(occupation_percentages["Professional Developer"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_six_years}{%d}\n",
  round(
    (java_exp_percentages["6 - 10 years"] +
       java_exp_percentages["11+ years"]) * 100,
    digits = 0
  )
))
# Background percentages (continues the LaTeX key output; cat() writes to
# whatever sink is active at this point of the script).
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_atleast_knowledgeable}{%d}\n",
  round(
    (crypto_knowl_percentages["Knowledgeable"] +
       crypto_knowl_percentages["Very Knowledgeable"]) * 100,
    digits = 0
  )
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_rarely_need_crypto}{%d}\n",
  round(crypto_use_percentages["Rarely"] * 100, digits = 0)
))

# task ranks
# The rank-1 count is looked up by name rather than by position, so that a
# task nobody ranked first cannot silently report the count of another rank.
secure_conn_rank_1 <- round(
  (secure_conn_counts["1"] / num_participants) * 100,
  digits = 0
)
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_secure_comm_rank1}{%d}\n",
  secure_conn_rank_1
))
user_auth_rank_1 <- round(
  (user_auth_counts["1"] / num_participants) * 100,
  digits = 0
)
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_user_auth_rank1}{%d}\n",
  user_auth_rank_1
))

# Replace NA in ranks with 10 (i.e. "not ranked"). Only the rank columns are
# touched, so NAs in unrelated columns are left alone.
rank_columns <- c(
  user_auth = "Task_UserAuth",
  secure_conn = "Task_SecureConn",
  encrypt_file = "Task_EncryptFile",
  transfer_file = "Task_TransferFile",
  sign = "TaskOtherSign"
)
duplicate <- data
for (rank_column in rank_columns) {
  duplicate[[rank_column]][is.na(duplicate[[rank_column]])] <- 10
}

# For every task: the average rank (unranked counted as 10) and the number
# of participants who actually ranked the task.
for (key in names(rank_columns)) {
  ranks <- duplicate[[rank_columns[[key]]]]
  cat(sprintf(
    "\\pgfkeyssetvalue{%s_avg_rank}{%.2f}\n",
    key, round(mean(ranks), digits = 2)
  ))
  cat(sprintf(
    "\\pgfkeyssetvalue{%s_count_ranked}{%d}\n",
    key, sum(ranks < 10)
  ))
}

# Library usage.
num_of_jca_users <- sum(data$API_JCA == 1, na.rm = TRUE)
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_jca_users}{%d}\n",
  round(num_of_jca_users / num_participants * 100, digits = 0)
))
num_of_bc_users <- sum(data$API_BC == 1, na.rm = TRUE)
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_bc_users}{%d}\n",
  round(num_of_bc_users / num_participants * 100, digits = 0)
))
# NOTE(review): this adds the two counts, so a participant using both
# libraries is counted twice - confirm that is intended.
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_bc_and_jca_users}{%d}\n",
  round(
    (num_of_bc_users + num_of_jca_users) / num_participants * 100,
    digits = 0
  )
))

# Collapse the "very" answers into their base category.
levels(data$EaseOfUse)[
  levels(data$EaseOfUse) %in% "Very hard to use"
] <- "Hard to use"
levels(data$EaseOfUse)[
  levels(data$EaseOfUse) %in% "Very easy to use"
] <- "Easy to use"

cat(sprintf(
  "\\pgfkeyssetvalue{percentage_hard_to_use}{%d}\n",
  round(
    sum(data$EaseOfUse == "Hard to use", na.rm = TRUE) /
      num_participants * 100,
    digits = 0
  )
))

# which() keeps rows with a missing API flag out of the subsets.
jca_data <- data[which(data$API_JCA == 1), ]
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_jca_hard_to_use}{%d}\n",
  round(
    sum(jca_data$EaseOfUse == "Hard to use", na.rm = TRUE) /
      num_of_jca_users * 100,
    digits = 0
  )
))
bc_data <- data[which(data$API_BC == 1), ]
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_bc_hard_to_use}{%d}\n",
  round(
    sum(bc_data$EaseOfUse == "Hard to use", na.rm = TRUE) /
      num_of_bc_users * 100,
    digits = 0
  )
))

# Most used library per participant; "BC" overrides "JCA" when both flags
# are set.
data$MostUsedLib <- ""
data$MostUsedLib <- ifelse(data$API_JCA == 1, "JCA", "")
data$MostUsedLib <- ifelse(data$API_BC == 1, "BC", data$MostUsedLib)

test <- chisq.test(data$MostUsedLib, data$EaseOfUse)
cat(sprintf(
  "\\pgfkeyssetvalue{mostusedlib_easeofuse_pvalue}{%0.3f}\n",
  round(test$p.value, digits = 3)
))

# Participants whose free-text obstacle answer is longer than 10 characters.
cat(sprintf(
  "\\pgfkeyssetvalue{num_of_partic_obstacles}{%d}\n",
  sum(stri_length(data$Obstacles) > 10, na.rm = TRUE)
))

cat(sprintf(
  "\\pgfkeyssetvalue{percentage_sequence_frequently}{%d}\n",
  round(api_sequence_percentage["Frequently"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_api_to_use_frequently}{%d}\n",
  round(api_to_use_percentage["Frequently"] * 100, digits = 0)
))
# Remaining obstacle percentages (cat() writes to whatever sink is active at
# this point of the script).
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_underlying_frequently}{%d}\n",
  round(underlyin_percentage["Frequently"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_parameters_frequently}{%d}\n",
  round(param_percentages["Frequently"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_env_setup_frequently}{%d}\n",
  round(env_setup_percentages["Frequently"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_concepts_never}{%d}\n",
  round(concepts_percentage["Never"] * 100, digits = 0)
))
cat(sprintf(
  "\\pgfkeyssetvalue{percentage_algm_atleast_occassionally}{%d}\n",
  round(
    (corr_algm_percentages["Occasionally"] +
       corr_algm_percentages["Frequently"]) * 100,
    digits = 0
  )
))

############################## correlations ##############################
background <- c("Occupation", "JavaExperience", "CryptoKnowledge",
                "CryptoUse")
obstacles <- c("CorrAlgm", "Concepts", "WhichJavaAPI", "Env", "Sequence",
               "Parameters", "Provider", "UnderlyImpl", "ErrorMessages")

##### chi squared between background and ease of use rating
chi_squared_multiple_test(background, c("EaseOfUse"),
                          "background_ease_of_use_chi_sq.csv")

##### chi squared between most used lib & obstacles
chi_squared_multiple_test(c("MostUsedLib"), obstacles,
                          "most_used_lib_obstacles_chi_sq.csv")

##### chi squared test between background & obstacles
chi_squared_multiple_test(background, obstacles,
                          "background_obstacles_chi_sq.csv")

##### spearman correlation between obstacles #########
# Answers are mapped to an ordinal score through a lookup table; any other
# value (including NA) becomes NA. This works for factor and character
# columns alike.
rating_scores <- c(
  "Don't know" = 0,
  "Never" = 1,
  "Rarely" = 2,
  "Occasionally" = 3,
  "Frequently" = 4
)
correlation_data <- data[obstacles]
correlation_data[] <- lapply(
  correlation_data,
  function(answers) unname(rating_scores[as.character(answers)])
)

correlations <- cor(correlation_data, method = "spearman")
write.csv(correlations, "correlations_spearman.csv")

# Report every pair of distinct obstacles (lower triangle only) whose
# correlation exceeds 0.5, together with the p-value of the Spearman test.
row_names <- rownames(correlations)
for (i in seq_len(nrow(correlations))) {
  for (j in seq_len(i - 1)) {
    # isTRUE() skips NA correlations, which would otherwise make `if` fail.
    if (isTRUE(correlations[i, j] > 0.5)) {
      obs1 <- row_names[i]
      obs2 <- row_names[j]
      test <- cor.test(correlation_data[[obs1]], correlation_data[[obs2]],
                       method = "spearman")
      cat(sprintf(
        "\\pgfkeyssetvalue{%s_%s_correlation}{%0.2f}\n",
        obs1, obs2, round(correlations[i, j], digits = 2)
      ))
      cat(sprintf(
        "\\pgfkeyssetvalue{%s_%s_p_value}{%0.6f}\n",
        obs1, obs2, round(test$p.value, digits = 6)
      ))
    }
  }
}

# Stop redirecting output.
sink()