library(ggplot2)
library(grid)
library(stringr)
library(reshape)
library(scales)
library(stringi)
library(plyr)
require(gdata)

# Load the survey responses. The recoding code further down edits
# levels(data$...), which only works on factor columns; read.csv() stopped
# creating factors by default in R 4.0.0, so ask for them explicitly.
data <- read.csv('survey_analysis_data.csv', stringsAsFactors = TRUE)
# Number of analysed respondents (one row each); used below as the
# denominator for percentages and as the y-axis range of the bar charts.
num_participants <- nrow(data)

# Compute the cumulative midpoint of every row of `g`.
#
# g: data frame with a `value` column and a numeric `Freq` column.
# Returns a data frame with `value`, `Freq` and `pos`, where `pos` is the
# running total of `Freq` before the row plus half of the row's own `Freq`
# (i.e. the centre of that row's segment when the rows are stacked in order).
group_midpoints <- function(g) {
  boundaries <- c(0, cumsum(g$Freq))
  heights <- diff(boundaries)
  lower_edges <- boundaries[-length(boundaries)]
  data.frame(value = g$value, pos = lower_edges + heights / 2, Freq = g$Freq)
}

# Run a chi-squared test (chisq.test) for every pairing of a column named in
# `vector_one` with a column named in `vector_two`. The columns are read from
# the global `data` frame.
#
# vector_one, vector_two: character vectors of column names in `data`.
# fileName: path of the CSV that receives one row per pairing with the columns
#           Row, Column, Chi.Square, df and the Bonferroni-adjusted p.value.
#
# Side effects: writes `fileName`, and prints one \pgfkeyssetvalue line to
# standard output for every pairing whose adjusted p-value is below 0.05.
chi_squared_multiple_test <- function(vector_one, vector_two, fileName){
	combs <- expand.grid(vector_one, vector_two)
	combs <- data.frame(lapply(combs, as.character), stringsAsFactors=FALSE)
	n_tests <- nrow(combs)

	# Collect the results in typed vectors so the numeric columns stay numeric
	# (and are written to the CSV as numbers rather than quoted strings).
	statistic <- numeric(n_tests)
	deg_freedom <- numeric(n_tests)
	raw_p <- numeric(n_tests)

	# seq_len() keeps the loop from running when there are no combinations.
	for (i in seq_len(n_tests)) {
		test <- chisq.test(data[[combs[[1]][i]]], data[[combs[[2]][i]]])
		statistic[i] <- round(test$statistic, 7)
		deg_freedom[i] <- test$parameter
		raw_p[i] <- round(test$p.value, 7)
	}

	# adjust using conservative bonferroni method (Holm's used by Robillard)
	results <- data.frame(Row=combs[[1]],
		Column=combs[[2]],
		Chi.Square=statistic,
		df=deg_freedom,
		p.value=p.adjust(raw_p, method="bonferroni"),
		stringsAsFactors=FALSE)

	for (i in seq_len(n_tests)) {
		adjusted_p <- results$p.value[i]
		# An undefined p-value is never reported as significant.
		if (!is.na(adjusted_p) && adjusted_p < 0.05) {
			cat(sprintf("\\pgfkeyssetvalue{%s_%s_p_value}{%0.4f}\n",
				combs[[1]][i],
				combs[[2]][i],
				adjusted_p))
		}
	}
	write.csv(results, fileName)
}

# Style a count bar chart, label every bar with a percentage and save it.
#
# plot:             ggplot object whose x aesthetic is the answer variable.
# fileName:         output path handed to ggsave().
# use_total_partic: if TRUE the bar labels are percentages of all participants
#                   (num_participants); if FALSE they are percentages of the
#                   rows that were actually plotted.
# width, height, dpi: forwarded to ggsave().
#
# Reads the global `num_participants` for the y-axis range and for the
# percentage denominator. Returns whatever ggsave() returns.
plot_bar_chart <- function(plot, fileName,use_total_partic=FALSE,width=11.5, height=2.25, dpi=300){
	plot <- plot +
		theme_classic() +
		geom_bar(width=0.4, fill="#808080",position=position_dodge(0.9)) +
		theme(axis.title.x=element_blank(),
			axis.title.y=element_blank(),
			axis.ticks.y=element_blank(),
			axis.text.y=element_blank(),
			axis.line.y=element_blank(),
			axis.text.x=element_text(size=26)) +
		scale_y_continuous(expand = c(0,0), limits=c(0,num_participants)) +
		scale_x_discrete(drop=FALSE, labels = function(x) str_wrap(x, width = 10))

	if (use_total_partic) {
		# bquote() pastes the current participant count into the aesthetic as a
		# literal number, so the label expression does not have to look up a
		# global variable when the statistic is evaluated.
		label_mapping <- eval(bquote(aes(y = (..count..),
			label = paste(round((..count..)/.(num_participants)*100),sep='', "%"))))
	} else {
		label_mapping <- aes(y = (..count..),
			label = paste(round((..count..)/sum(..count..)*100), sep='',"%"))
	}
	plot <- plot + geom_text(label_mapping,
		stat="bin",colour="black",size=8,vjust=-0.5)

	# Pass the finished plot explicitly instead of relying on last_plot().
	ggsave(fileName, plot=plot, width=width, height=height, dpi=dpi)
}

####################Occupations#############################
# Collapse the detailed occupation answers into broader groups. Each entry
# maps an existing factor level (name) to the level that replaces it (value);
# levels that do not occur in the data are left untouched.
occupation_groups <- c(
	"Academic Researcher" = "Researcher",
	"Industrial Researcher" = "Researcher",
	"Graduate student" = "Student",
	"Undergraduate student" = "Student",
	"Freelance developer" = "Professional Developer",
	"Industrial developer" = "Professional Developer")

for (old_level in names(occupation_groups)) {
	level_index <- match(old_level, levels(data$Occupation))
	levels(data$Occupation)[level_index] <- occupation_groups[[old_level]]
}

# Bar chart of the grouped occupations in a fixed display order.
occupation_plot <- ggplot(
	data,
	aes(factor(Occupation,
		levels=c("Student", "Researcher", "Professional Developer", "Other"),
		ordered=TRUE)))

plot_bar_chart(occupation_plot, "../../figures/study4_occupation.pdf")

####################Java Experience#############################
# Append the "< 1 year" answer to the factor levels (per the original note,
# nobody picked it).
levels(data$JavaExperience) <- append(levels(data$JavaExperience), "< 1 year")

# Bar chart of Java experience, shortest to longest.
java_experience_plot <- ggplot(
	data,
	aes(factor(JavaExperience,
		levels=c("< 1 year", "1 - 2 years", "2 - 5 years", "6 - 10 years", "11+ years"),
		ordered=TRUE)))

plot_bar_chart(java_experience_plot, "../../figures/study4_java_experience.pdf")

####################Cryptography Knowledge#############################
# Append the "Not Knowledgeable" answer to the factor levels.
levels(data$CryptoKnowledge) <- append(levels(data$CryptoKnowledge), "Not Knowledgeable")

# Replace the full questionnaire wording (name) by a short label (value).
knowledge_labels <- c(
	"Very knowledgeable - I know all/most areas of cryptography, the different available algorithms, and what they are used for" = "Very Knowledgeable",
	"Somewhat knowledgeable - I have a vague idea about the various areas of cryptography and what they are used for" = "Somewhat Knowledgeable",
	"Knowledgeable - I am familiar with the various areas of cryptography and what they are used for" = "Knowledgeable")

for (long_label in names(knowledge_labels)) {
	level_index <- match(long_label, levels(data$CryptoKnowledge))
	levels(data$CryptoKnowledge)[level_index] <- knowledge_labels[[long_label]]
}

# Bar chart of self-reported cryptography knowledge, lowest to highest.
crypto_knowledge_plot <- ggplot(
	data,
	aes(factor(CryptoKnowledge,
		levels=c("Not Knowledgeable", "Somewhat Knowledgeable", "Knowledgeable", "Very Knowledgeable"),
		ordered=TRUE)))

plot_bar_chart(crypto_knowledge_plot, "../../figures/study4_crypto_knowledge.pdf")

####################Cryptography Use#############################
# Append the "Never" answer to the factor levels.
levels(data$CryptoUse) <- append(levels(data$CryptoUse), "Never")

# Replace the full questionnaire wording (name) by a short label (value).
use_labels <- c(
	"Rarely - I use cryptography in less than 33% of my development tasks" = "Rarely",
	"Occasionally - I use cryptography in more than 33% but less than 66% of my development tasks" = "Occasionally",
	"Frequently - I use cryptography in more than 66% of my development tasks" = "Frequently")

for (long_label in names(use_labels)) {
	level_index <- match(long_label, levels(data$CryptoUse))
	levels(data$CryptoUse)[level_index] <- use_labels[[long_label]]
}

# Bar chart of how often participants use cryptography, least to most.
crypto_use_plot <- ggplot(
	data,
	aes(factor(CryptoUse,
		levels=c("Never", "Rarely", "Occasionally", "Frequently"),
		ordered=TRUE)))

plot_bar_chart(crypto_use_plot, "../../figures/study4_crypto_use.pdf")

############Task_UserAuth##############
# Append "5" to the levels of the rank column (presumably a rank that does
# not occur in the data - TODO confirm).
levels(data$Task_UserAuth) <- append(levels(data$Task_UserAuth), "5")

# Keep only the participants who ranked this task.
user_auth_answered <- !is.na(data$Task_UserAuth)
user_auth_data <- data[user_auth_answered, ]

user_auth_plot <- ggplot(
	user_auth_data,
	aes(factor(Task_UserAuth,
		levels=c("1", "2", "3", "4", "5"),
		ordered=TRUE)))

# use_total_partic must be TRUE here: the NA rows were dropped above, so the
# default labels would be percentages of the reduced total instead of all
# participants.
plot_bar_chart(user_auth_plot, "../../figures/study4_userauth.pdf", use_total_partic=TRUE)

############Task_EncryptFile##############
# Append "5" to the levels of the rank column (presumably a rank that does
# not occur in the data - TODO confirm).
levels(data$Task_EncryptFile) <- append(levels(data$Task_EncryptFile), "5")

# Keep only the participants who ranked this task.
encrypt_file_answered <- !is.na(data$Task_EncryptFile)
encrypt_file_data <- data[encrypt_file_answered, ]

encrypt_file_plot <- ggplot(
	encrypt_file_data,
	aes(factor(Task_EncryptFile,
		levels=c("1", "2", "3", "4", "5"),
		ordered=TRUE)))

# use_total_partic must be TRUE here: the NA rows were dropped above, so the
# default labels would be percentages of the reduced total instead of all
# participants.
plot_bar_chart(encrypt_file_plot, "../../figures/study4_encrypt_file.pdf", use_total_partic=TRUE)

############Task_SecureConn##############
# Append "5" to the levels of the rank column (presumably a rank that does
# not occur in the data - TODO confirm).
levels(data$Task_SecureConn) <- append(levels(data$Task_SecureConn), "5")

# Keep only the participants who ranked this task.
secure_conn_answered <- !is.na(data$Task_SecureConn)
secure_conn_data <- data[secure_conn_answered, ]

secure_conn_plot <- ggplot(
	secure_conn_data,
	aes(factor(Task_SecureConn,
		levels=c("1", "2", "3", "4", "5"),
		ordered=TRUE)))

# use_total_partic must be TRUE here: the NA rows were dropped above, so the
# default labels would be percentages of the reduced total instead of all
# participants.
plot_bar_chart(secure_conn_plot, "../../figures/study4_secure_conn.pdf", use_total_partic=TRUE)

############Task_TransferFile##############
# Keep only the participants who ranked this task.
transfer_file_answered <- !is.na(data$Task_TransferFile)
transfer_file_data <- data[transfer_file_answered, ]

transfer_file_plot <- ggplot(
	transfer_file_data,
	aes(factor(Task_TransferFile,
		levels=c("1", "2", "3", "4", "5"),
		ordered=TRUE)))

# use_total_partic must be TRUE here: the NA rows were dropped above, so the
# default labels would be percentages of the reduced total instead of all
# participants.
plot_bar_chart(transfer_file_plot, "../../figures/study4_transfer_file.pdf", use_total_partic=TRUE)

############TaskOtherSign##############
# Which level gets appended depends on the data: this column already contains
# rank 5, and appending a second "5" caused problems, so "4" is appended
# instead.
# FIXME: there must be a smarter way of doing this automatically
levels(data$TaskOtherSign) <- append(levels(data$TaskOtherSign), "4")

# Keep only the participants who ranked this task.
signatures_answered <- !is.na(data$TaskOtherSign)
signatures_data <- data[signatures_answered, ]

signatures_plot <- ggplot(
	signatures_data,
	aes(factor(TaskOtherSign,
		levels=c("1", "2", "3", "4", "5"),
		ordered=TRUE)))

# use_total_partic must be TRUE here: the NA rows were dropped above, so the
# default labels would be percentages of the reduced total instead of all
# participants.
plot_bar_chart(signatures_plot, "../../figures/study4_signatures.pdf", use_total_partic=TRUE)


############Ranked Obstacles##############
# Columns of `data` holding the obstacle-frequency answers shown in the chart.
obstacles <- c("CorrAlgm",
	"Concepts",
	"WhichJavaAPI",
	"Env",
	"Sequence",
	"Parameters",
	"Provider",
	"UnderlyImpl",
	"ErrorMessages")

# Reshape to long form (one row per participant and obstacle), then count how
# often each answer was given for each obstacle.
obstacle_data <- data[obstacles]
obstacle_data$category <- row.names(obstacle_data)
obstacle_data <- melt(obstacle_data, id.vars="category")
dat <- data.frame(with(obstacle_data, table(variable,value)))
# Disabled: explicit ordering of the answer levels.
#target=ordered(c("Frequently",
			# "Occasionally",
			# "Rarely",
			# "Never",
			# "Don't know"))
#dat$value <- reorder.factor(dat$value, new.order=target)
# Label position for every answer segment, computed per obstacle.
dat2 <- ddply(dat, .(variable), group_midpoints)
dat2$pos[dat2$Freq %in% c(0)] <- -1 #hide 0% labels (moved below the y-axis range)


# Stacked bar per obstacle; segment labels are percentages of all participants.
# NOTE(review): the upper y limit 41 is hard-coded - presumably the participant
# count plus head-room for the legend; confirm before reusing with other data.
ggplot(dat2, aes(x = variable, fill = value)) +
geom_bar(aes(weight=Freq), position="stack", width=0.5) +
theme_classic() +
theme(legend.position="top",
	legend.direction="horizontal",
	legend.title=element_blank(),
	legend.text = element_text(size=8),
	axis.title.x=element_blank(),
	axis.text.x=element_text(size=10),
	panel.grid.minor = element_line(colour = "grey", linetype="dashed",size = 0.25)) +
scale_y_continuous(expand = c(0,0), limits=c(0,41)) +
scale_x_discrete(labels = c("CorrAlgm" = str_wrap("Identify Correct Algm",width=10),
	"Concepts" = str_wrap("Identify crypto concepts",width=10),
	"WhichJavaAPI" = str_wrap("Identify Java API",width=10),
	"Env" = str_wrap("Setup environment",width=10),
	"Sequence" = str_wrap("Identify sequence of API calls",width=12),
	"Parameters" = str_wrap("Identify parameters",width=10),
	"Provider" = str_wrap("Identify provider",width=10),
	"UnderlyImpl" = str_wrap("Understand API implementation",width=10),
	"ErrorMessages" = str_wrap("Understand error msgs",width=10))) +
scale_fill_grey(start = 0.2, end = .8) +
geom_text(position = "identity",
	aes(x = variable, y = pos, label = paste(round(Freq/num_participants*100),sep='', "%")),
	colour="white",
	size=3) +
ylab("Number of Participants")
ggsave("../../figures/study4_rated_obstacles.pdf", width=9, height=4.5, dpi=300)

##########output percentages to latex#####################
# Share of participants per answer (prop.table) for the background questions.
occupation_percentages <- prop.table(table(data$Occupation))
java_exp_percentages <- prop.table(table(data$JavaExperience))
crypto_knowl_percentages <- prop.table(table(data$CryptoKnowledge))
crypto_use_percentages <- prop.table(table(data$CryptoUse))

# Absolute number of participants per rank for two of the tasks.
secure_conn_counts <- table(data$Task_SecureConn)
user_auth_counts <- table(data$Task_UserAuth)

# Share of participants per answer for the obstacle questions.
api_sequence_percentage <- prop.table(table(data$Sequence))
api_to_use_percentage <- prop.table(table(data$WhichJavaAPI))
underlyin_percentage <- prop.table(table(data$UnderlyImpl))
concepts_percentage <- prop.table(table(data$Concepts))
corr_algm_percentages <- prop.table(table(data$CorrAlgm))
param_percentages <- prop.table(table(data$Parameters))
env_setup_percentages <- prop.table(table(data$Env))

# Redirect everything printed from here on into the LaTeX key file; the
# redirection stays active until sink() is called again without arguments.
sink("../../sections/survey_data.tex")
#first two hard-coded for now
cat(sprintf("\\pgfkeyssetvalue{total_responded}{43}\n"))
cat(sprintf("\\pgfkeyssetvalue{ignored}{6}\n"))
cat(sprintf("\\pgfkeyssetvalue{total_analyzed}{%d}\n",
	nrow(data)))

# Background percentages, rounded to whole numbers. Only the value is passed
# to sprintf(): every format has exactly one conversion.
cat(sprintf("\\pgfkeyssetvalue{percentage_students}{%d}\n",
	round(occupation_percentages['Student']*100,digits=0)))
cat(sprintf("\\pgfkeyssetvalue{percentage_professional}{%d}\n",
	round(occupation_percentages['Professional Developer']*100,digits=0)))
# Six or more years of Java experience = the two highest experience groups.
cat(sprintf("\\pgfkeyssetvalue{percentage_six_years}{%d}\n",
	round((java_exp_percentages['6 - 10 years']+java_exp_percentages['11+ years'])*100,
		digits=0)))
cat(sprintf("\\pgfkeyssetvalue{percentage_atleast_knowledgeable}{%d}\n",
	round((crypto_knowl_percentages['Knowledgeable']+crypto_knowl_percentages['Very Knowledgeable'])*100,
		digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_rarely_need_crypto}{%d}\n",
	round(crypto_use_percentages['Rarely']*100,digits=0)))

#task ranks
# Percentage of all participants who gave a task rank 1. The count is looked
# up by its table name "1" rather than by position, so a task that nobody
# ranked first yields 0 instead of the count of the lowest rank present.
rank_one_percentage <- function(rank_counts) {
	first_place <- if ("1" %in% names(rank_counts)) rank_counts[["1"]] else 0
	round((first_place/num_participants)*100,digits=0)
}

secure_conn_rank_1 <- rank_one_percentage(secure_conn_counts)
cat(sprintf("\\pgfkeyssetvalue{percentage_secure_comm_rank1}{%d}\n",
	secure_conn_rank_1))

user_auth_rank_1 <- rank_one_percentage(user_auth_counts)
cat(sprintf("\\pgfkeyssetvalue{percentage_user_auth_rank1}{%d}\n",
	user_auth_rank_1))

#replace NA in ranks with 10
# Only the rank columns are touched; the other columns of the copy are not
# used below.
rank_columns <- c(user_auth="Task_UserAuth",
	secure_conn="Task_SecureConn",
	encrypt_file="Task_EncryptFile",
	transfer_file="Task_TransferFile",
	sign="TaskOtherSign")
duplicate <- data
duplicate[rank_columns][is.na(duplicate[rank_columns])] <- 10

# For every task: the average rank (unranked counted as 10) and the number of
# participants who ranked the task at all (rank below 10).
for (key_prefix in names(rank_columns)) {
	task_ranks <- duplicate[[rank_columns[[key_prefix]]]]
	cat(sprintf("\\pgfkeyssetvalue{%s_avg_rank}{%.2f}\n",
		key_prefix,
		round(mean(task_ranks),digits=2)))
	cat(sprintf("\\pgfkeyssetvalue{%s_count_ranked}{%d}\n",
		key_prefix,
		sum(task_ranks < 10)))
}

# Number of participants flagged as JCA / BC users (flag value 1).
num_of_jca_users <- sum(data$API_JCA == 1,na.rm=TRUE )
cat(sprintf("\\pgfkeyssetvalue{percentage_jca_users}{%d}\n",
	round(num_of_jca_users/num_participants*100,digits=0)))

num_of_bc_users <- sum(data$API_BC == 1,na.rm=TRUE )
cat(sprintf("\\pgfkeyssetvalue{percentage_bc_users}{%d}\n",
	round(num_of_bc_users/num_participants*100,digits=0)))

# NOTE(review): this adds the two counts, so a participant flagged for both
# libraries would be counted twice - confirm the flags are mutually exclusive.
cat(sprintf("\\pgfkeyssetvalue{percentage_bc_and_jca_users}{%d}\n",
	round((num_of_bc_users + num_of_jca_users)/num_participants*100,digits=0)))


# Fold the "very" answers into the plain hard / easy answers.
levels(data$EaseOfUse)[match("Very hard to use",levels(data$EaseOfUse))] <- "Hard to use"
levels(data$EaseOfUse)[match("Very easy to use",levels(data$EaseOfUse))] <- "Easy to use"

cat(sprintf("\\pgfkeyssetvalue{percentage_hard_to_use}{%d}\n",
	round(sum(data$EaseOfUse == "Hard to use",na.rm=TRUE )/num_participants*100,digits=0)))

# which() drops participants whose flag is NA instead of adding all-NA rows.
jca_data <- data[which(data$API_JCA == 1),]
cat(sprintf("\\pgfkeyssetvalue{percentage_jca_hard_to_use}{%d}\n",
	round(sum(jca_data$EaseOfUse == "Hard to use",na.rm=TRUE )/num_of_jca_users*100,digits=0)))

bc_data <- data[which(data$API_BC == 1),]
cat(sprintf("\\pgfkeyssetvalue{percentage_bc_hard_to_use}{%d}\n",
	round(sum(bc_data$EaseOfUse == "Hard to use",na.rm=TRUE )/num_of_bc_users*100,digits=0)))

# Library label per participant: "JCA" for JCA users, overwritten by "BC" for
# BC users, "" otherwise. Then test it against the ease-of-use rating.
data$MostUsedLib <- ifelse(data$API_JCA == 1, "JCA", "")
data$MostUsedLib <- ifelse(data$API_BC == 1, "BC", data$MostUsedLib)
test <- chisq.test(data$MostUsedLib,data$EaseOfUse)

cat(sprintf("\\pgfkeyssetvalue{mostusedlib_easeofuse_pvalue}{%0.3f}\n",
	round(test$p.value, digits=3)))

# Number of participants whose free-text obstacle answer is longer than
# 10 characters.
cat(sprintf("\\pgfkeyssetvalue{num_of_partic_obstacles}{%d}\n",
	sum(stri_length(data$Obstacles)>10, na.rm=TRUE)))

# Share of participants (whole percent) giving a particular answer to the
# individual obstacle questions. Only the value is passed to sprintf(): every
# format has exactly one conversion.
cat(sprintf("\\pgfkeyssetvalue{percentage_sequence_frequently}{%d}\n",
	round(api_sequence_percentage['Frequently']*100,digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_api_to_use_frequently}{%d}\n",
	round(api_to_use_percentage['Frequently']*100,digits=0)))


cat(sprintf("\\pgfkeyssetvalue{percentage_underlying_frequently}{%d}\n",
	round(underlyin_percentage['Frequently']*100,digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_parameters_frequently}{%d}\n",
	round(param_percentages['Frequently']*100,digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_env_setup_frequently}{%d}\n",
	round(env_setup_percentages['Frequently']*100,digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_concepts_never}{%d}\n",
	round(concepts_percentage['Never']*100,digits=0)))

cat(sprintf("\\pgfkeyssetvalue{percentage_algm_atleast_occassionally}{%d}\n",
	round((corr_algm_percentages['Occasionally']+corr_algm_percentages['Frequently'])*100,
		digits=0)))

##############################correlations#####################################
# Columns describing each participant's background.
background <- c("Occupation", "JavaExperience", "CryptoKnowledge", "CryptoUse")

# Columns holding the obstacle-frequency answers.
obstacles <- c("CorrAlgm", "Concepts", "WhichJavaAPI", "Env", "Sequence",
	"Parameters", "Provider", "UnderlyImpl", "ErrorMessages")

# Each call writes its own CSV and prints the significant pairings.

# background vs. ease-of-use rating
chi_squared_multiple_test(
	vector_one = background,
	vector_two = "EaseOfUse",
	fileName = 'background_ease_of_use_chi_sq.csv')

# most used library vs. obstacles
chi_squared_multiple_test(
	vector_one = "MostUsedLib",
	vector_two = obstacles,
	fileName = 'most_used_lib_obstacles_chi_sq.csv')

# background vs. obstacles
chi_squared_multiple_test(
	vector_one = background,
	vector_two = obstacles,
	fileName = 'background_obstacles_chi_sq.csv')


##### spearman correlation between obstacles #########
# Numeric score for every answer; any other value (including NA) becomes NA.
frequency_scores <- c("Don't know" = 0,
	"Never" = 1,
	"Rarely" = 2,
	"Occasionally" = 3,
	"Frequently" = 4)

# Translate each obstacle column from its answer text to the numeric score.
correlation_data <- data[obstacles]
correlation_data[obstacles] <- lapply(
	correlation_data[obstacles],
	function(answers) unname(frequency_scores[as.character(answers)]))

# Pairwise Spearman correlation matrix of the obstacle scores.
correlations <- cor(correlation_data, method="spearman")
write.csv(correlations, 'correlations_spearman.csv')

row_names <- rownames(correlations)

# Walk the lower triangle of the correlation matrix (j < i) and report every
# pair of obstacles whose Spearman correlation exceeds 0.5, together with the
# p-value of the corresponding correlation test.
for (i in seq_len(nrow(correlations))) {
	for (j in seq_len(i - 1)) {
		# isTRUE() skips undefined (NA) correlations instead of aborting the
		# script while the output is still redirected.
		if (!isTRUE(correlations[i,j] > 0.5)) {
			next
		}
		obs1 <- row_names[i]
		obs2 <- row_names[j]
		test <- cor.test(correlation_data[[obs1]], correlation_data[[obs2]], method="spearman")
		cat(sprintf("\\pgfkeyssetvalue{%s_%s_correlation}{%0.2f}\n",
			obs1,
			obs2,
			round(correlations[i,j],digits=2)))
		cat(sprintf("\\pgfkeyssetvalue{%s_%s_p_value}{%0.6f}\n",
			obs1,
			obs2,
			round(test$p.value,digits=6)))
	}
}


# Stop redirecting printed output.
sink()