suppressMessages(library(CORElearn))
suppressMessages(library(dplyr))
suppressMessages(library(plyr))
suppressMessages(library(data.table))
suppressMessages(library(randomForest))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(argparser))
suppressMessages(library(stringr))

#export the episodes in the input format of the HirateYamana algorithm (SPMF)
export_ds_for_spm <- function(target_event, episodes_list, output){
  if (file.exists(output)) {
    file.remove(output)
  }
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]][ , !(names(episodes_list[[ep_index]]) %in% c("Timestamps"))]
    ep_list = list()
    for(i in (1:nrow(ep))){
      matches = which(ep[i,] %in% c(1))
      if(length(matches) == 0){
        next
      }
      line = paste(matches, collapse=" ")
      ep_list[i] = line
    }
    if(length(ep_list) == 0){
      next
    }
    ep_list[length(ep_list)+1] = target_event
    episode = ""
    for(ep_lli in (1:length(ep_list))){
      if(length(ep_list[[ep_lli]]) > 0){
        index = paste(paste("<", ep_lli, sep=""), ">", sep="")
        if(episode == ""){
          episode = paste(index, ep_list[[ep_lli]], sep=" ")
        } else {
          episode = paste(episode, paste(index, ep_list[[ep_lli]], sep=" "), sep=" -1 ")
        }
      }
    }
    write(paste(episode, "-1 -2"), file=output, append=TRUE)
  }
}

#remove events that appear less often than (target event frequency) * ratio
remove_rare_events <- function(ds, target_event_frequency_proportion_rare){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING: REMOVE RARE EVENTS~~~~~~~")
  }
  a = table(ds$Event_id)
  target_event_frequency = a[names(a) == target_event]
  rare_events = as.integer(names(a[a < target_event_frequency * target_event_frequency_proportion_rare]))
  return(ds[!(ds$Event_id %in% rare_events),])
}

#remove events that appear more often than (max event frequency) * ratio
remove_frequent_events <- function(ds, max_event_frequency_proportion_frequent){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING: REMOVE FREQUENT EVENTS~~~~~~~")
  }
  a = table(ds$Event_id)
  max_freq = sort(a, decreasing = TRUE)[[1]]
  frequent_events = as.integer(names(a[a > max_freq * max_event_frequency_proportion_frequent]))
  return(ds[!(ds$Event_id %in% frequent_events),])
}
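# A minimal sketch of the format written by export_ds_for_spm (hypothetical
# data): each segment becomes "<t>" followed by the column positions of its
# 1s, segments are joined with "-1", the target event is appended as a final
# itemset, and the episode is terminated with "-1 -2". For an episode whose
# first segment has 1s in columns 1 and 2, whose second segment has a 1 in
# column 3, and with target event 151, the emitted line would be:
#   <1> 1 2 -1 <2> 3 -1 <3> 151 -1 -2
if (FALSE) {
  demo_episode = data.frame(Timestamps = c("2020-01-01", "2020-01-02"),
                            e_1 = c(1, 0), e_2 = c(1, 0), e_3 = c(0, 1))
  export_ds_for_spm(151, list(demo_episode), "demo_spm.txt")
}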
#keep only the first occurrence of an event in consecutive segments
keep_only_first_occurrence <- function(episodes_list){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING: KEEP ONLY FIRST OCCURRENCE~~~~~~~")
  }
  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    #for every segment of each episode, starting from the end up to the second segment;
    #we keep only the 1st occurrence of consecutive events, so starting from the end is the easy way
    for(i in (nrow(ep):2)){
      #as we deal with binary vectors, to find the indices where both vectors have a "1" we sum them and check for "2"s in the result
      matches = which((ep[i,] + ep[i-1,]) %in% c(2))
      #replace the 1s with 0s in the matching positions of the segment that is closer to the end of the episode
      ep[i,][c(matches)] = 0
    }
    episodes_list[[ep_index]] = ep
  }
  return(episodes_list)
}

#multi-instance learning, as described in the text of the paper
mil_text <- function(milw, F_thres, episodes_list, b_length){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING: MULTI INSTANCE LEARNING~~~~~~~")
  }
  window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    new_ep = data.frame(matrix(ncol = b_length+2, nrow = 0))
    i = 1
    while(i <= nrow(ep)){
      new_ep = rbind(new_ep, ep[i,])
      if(ep[i,][b_length+2] >= F_thres && nrow(window_df) < milw){
        window_df = rbind(window_df, ep[i,])
      }
      if(nrow(window_df) == milw || i == nrow(ep)){
        #average the windowed vectors and binarize the result
        mean = colMeans(window_df)
        mean[mean > 0] = 1
        mf = data.frame(as.list(mean))
        mf[1] = ep[i,][1]
        mf[b_length+2] = ep[i,][b_length+2]
        #colnames(mf) = colnames(new_ep)
        new_ep = rbind(new_ep, mf)
        if(nrow(window_df) > 1){
          i = i - (nrow(window_df)-2)
        }
        window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
      }
      i = i + 1
    }
    episodes_list[[ep_index]] = new_ep
  }
  return(episodes_list)
}

#multi-instance learning, as presented in the figure of the paper
mil_image <- function(milw, F_thres, episodes_list, b_length){
  if(!csv){
    print("~~~~~~~APPLYING PREPROCESSING: MULTI INSTANCE LEARNING~~~~~~~")
  }
  #for every episode in the episodes_list
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    new_ep = data.frame(matrix(ncol = b_length+2, nrow = 0))
    #a data.frame with the vectors that need to be averaged
    window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
    i = 1
    while(i <= nrow(ep)){
      #new_ep = rbind(new_ep,ep[i,])
      if(nrow(window_df) < milw){
        window_df = rbind(window_df, ep[i,])
      }
      if(nrow(window_df) == milw || i == nrow(ep)){
        mean = colMeans(window_df)
        mean[mean > 0] = 1
        mf = data.frame(as.list(mean))
        mf[1] = ep[i,][1]
        mf[b_length+2] = ep[i,][b_length+2]
        #colnames(mf) = colnames(new_ep)
        new_ep = rbind(new_ep, mf)
        if(window_df[1,][b_length+2] >= F_thres && nrow(window_df) > 1){
          i = i - (nrow(window_df)-1)
        }
        window_df = data.frame(matrix(ncol = b_length+2, nrow = 0))
      }
      i = i + 1
    }
    episodes_list[[ep_index]] = new_ep
  }
  return(episodes_list)
}

#the Risk function: a sigmoid over the distance of segment t from the end of the episode
compute_F <- function(s, midpoint, t, ep_length){
  #s affects the steepness
  return(1/(1+exp(s*(ep_length-midpoint-t))))
}
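# Sanity check of the risk sigmoid (illustrative values only): F(t) rises
# from ~0 at the start of an episode towards ~1 at the failure, and crosses
# 0.5 exactly `midpoint` segments before the end, since the exponent
# s*(ep_length - midpoint - t) is zero at t = ep_length - midpoint.
if (FALSE) {
  round(sapply(0:30, function(t) compute_F(0.7, 11, t, 30)), 3)
  compute_F(0.7, 11, 30 - 11, 30)  # == 0.5, the sigmoid midpoint
}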
#convert the per-day event lists of an episode to frequency vectors
compute_frequency_vectors <- function(aggr_episode_df, b_length, s, midpoint){
  freq_aggr_episode_df <- data.frame(matrix(ncol = b_length+2, nrow = 0))
  x <- c("Timestamps", paste("e_", c(1:b_length), sep=""), "Risk_F")
  for(i in 1:nrow(aggr_episode_df)) {
    #init a vector with b_length zeros
    freq_vector = as.vector(integer(b_length))
    seg <- aggr_episode_df[i,]
    #if the segment contains event j, store the number of occurrences of j in position j of freq_vector
    for(value in seg$x[[1]]){
      freq_vector[[value]] = length(which(seg$x[[1]] == value))
    }
    #add a new row to freq_aggr_episode_df; note that matrix() coerces all
    #elements to a common type, but the Timestamps column is only carried
    #along and is dropped before training
    F = compute_F(s, midpoint, i-1, nrow(aggr_episode_df))
    date = seg$Timeframe[[1]]
    new_df = data.frame(matrix(c(date, freq_vector, F), nrow=1, ncol=b_length+2))
    freq_aggr_episode_df <- rbind(freq_aggr_episode_df, new_df)
  }
  colnames(freq_aggr_episode_df) <- x
  return(freq_aggr_episode_df)
}

create_episodes_list <- function(ds, target_event, b_length, s, midpoint){
  if(!csv){
    print("~~~~~~~CREATING FREQUENCY VECTORS AND BINARIZING THEM~~~~~~~")
  }
  #divide the dataset into episodes
  target_event_spotted = FALSE
  #a list with one data.frame per episode
  episodes_list = list()
  #data.frame for the current episode
  episode_df <- data.frame(Timestamps=as.POSIXct(character()), Event_id=integer())
  #iterate over every line of the original dataset
  for(i in 1:nrow(ds)) {
    #get the current row of the ds
    meas <- ds[i,]
    #if it is the target event, enable the appropriate flag
    if((meas$Event_id == target_event)){
      target_event_spotted = TRUE
    }
    #fill the episode data.frame with the events that lie between two target events
    if(meas$Event_id != target_event && target_event_spotted){
      episode_df <- rbind(episode_df, data.frame(Timestamps=meas$Timestamps, Event_id=meas$Event_id))
    } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) != 0){
      #a second occurrence of the target event is spotted, close the episode
      #target_event_spotted = FALSE
      #aggregate all events by day to form the segments of the episode
      aggr_episode_df = aggregate(episode_df[ ,2], FUN=function(x){return(x)}, by=list(Timeframe=cut(as.POSIXct(episode_df$Timestamps, format="%Y-%m-%d"), "day"))) #alternative format: %Y-%m-%dT%H:%M:%OSZ
      #compute the frequency vector of each segment
      bin_aggr_episode_df = compute_frequency_vectors(aggr_episode_df, b_length, s, midpoint)
      #Remove event 0, which does not provide any info KOUGKA
      #bin_aggr_episode_df = bin_aggr_episode_df[ , !(names(bin_aggr_episode_df) %in% c("e_1"))]
      #add the episode to the episodes_list
      episodes_list[[length(episodes_list)+1]] = bin_aggr_episode_df
      #reset episode_df to an empty data.frame
      episode_df <- data.frame(Timestamps=as.POSIXct(character()), Event_id=integer())
    }
  }
  return(episodes_list)
}
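# Illustrative shape of one episode produced above (hypothetical values,
# b_length = 5):
#   Timestamps  e_1 e_2 e_3 e_4 e_5 Risk_F
#   2020-01-01    0   2   0   1   0  0.001
#   2020-01-02    1   0   0   0   0  0.002
# One row per day-segment; the event counts are binarized later in preprocess().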
preprocess <- function(ds, TEST_DATA, REMOVE_RARE_EVENTS, REMOVE_FREQUENT_EVENTS, KEEP_ONLY_FIRST_OCCURRENCE, MULTI_INSTANCE_LEARNING_TEXT, MULTI_INSTANCE_LEARNING_IMAGE, FEATURE_SELECTION, top_features, s, midpoint, b_length, target_event, target_event_frequency_proportion_rare, max_event_frequency_proportion_frequent, w, F_thres){
  #remove events that appear fewer than n times, with n = (target event frequency) * rare-events ratio
  if(REMOVE_RARE_EVENTS){
    ds <- remove_rare_events(ds, target_event_frequency_proportion_rare)
  }
  #remove events that appear more than n times, with n = (max event frequency) * frequent-events ratio
  if(REMOVE_FREQUENT_EVENTS){
    ds <- remove_frequent_events(ds, max_event_frequency_proportion_frequent)
  }
  episodes_list = create_episodes_list(ds, target_event, b_length, s, midpoint)
  #binarize the frequency vectors
  for(ep_index in (1:length(episodes_list))){
    ep = episodes_list[[ep_index]]
    ep[2:(ncol(ep)-1)][ep[2:(ncol(ep)-1)] > 0] = 1
    episodes_list[[ep_index]] = ep
  }
  #keep only the first occurrence of an event in consecutive segments
  if(KEEP_ONLY_FIRST_OCCURRENCE){
    episodes_list <- keep_only_first_occurrence(episodes_list)
  }
  #multi-instance learning to increase the pattern frequency
  if(MULTI_INSTANCE_LEARNING_TEXT){
    episodes_list <- mil_text(w, F_thres, episodes_list, b_length)
  } else if(MULTI_INSTANCE_LEARNING_IMAGE){
    episodes_list <- mil_image(w, F_thres, episodes_list, b_length)
  }
  return(episodes_list)
}

#keep the top_features attributes with the highest RReliefF scores
feature_selection <- function(merged_episodes, top_features){
  estReliefF <- attrEval(Risk_F ~ ., merged_episodes, estimator="RReliefFexpRank", ReliefIterations=50)
  sorted_indices = order(estReliefF, decreasing = TRUE)
  merged_episodes = merged_episodes %>% select(sorted_indices[1:top_features], ncol(merged_episodes))
  return(merged_episodes)
}

read_dataset <- function(path){
  dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  dataset[, 2] <- as.numeric(dataset[, 2])
  return(dataset)
}

#train a random forest on the training episodes and evaluate it on the test
#episodes (note: this masks base::eval)
eval <- function(train_episodes, test_episodes_list, seed){
  set.seed(seed)
  my.rf = randomForest(Risk_F ~ ., data=train_episodes, importance=TRUE)
  #varImpPlot(my.rf)
  false_positives = 0
  true_positives = 0
  false_negatives = 0
  for(ep in test_episodes_list){
    ep = ep[ , !(names(ep) %in% c("Timestamps"))]
    Prediction <- predict(my.rf, ep)
    ep_length = length(Prediction)
    pred_indices = as.numeric(names(Prediction[Prediction >= acceptance_threshold]))
    #alarms raised earlier than max_warning_interval days before the failure count as false positives
    if(length(pred_indices[pred_indices < (ep_length-(max_warning_interval))]) > 0){
      false_positives = false_positives + length(pred_indices[pred_indices < (ep_length-(max_warning_interval))])
    }
    #at least one alarm inside the warning interval makes the episode a true positive
    if(length(pred_indices[pred_indices >= (ep_length-(max_warning_interval)) & pred_indices <= (ep_length-min_warning_interval)]) > 0){
      true_positives = true_positives + 1
    } else {
      false_negatives = false_negatives + 1
    }
  }
  precision = true_positives/(true_positives+false_positives)
  if((true_positives+false_positives) == 0){
    precision = 0
  }
  recall = true_positives/length(test_episodes_list)
  F1 = 2*((precision*recall)/(precision+recall))
  if((precision+recall) == 0){
    F1 = 0
  }
  if(!csv){
    cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\n"))
  } else {
    cat(paste(argv$test,",",true_positives,",",false_positives,",",false_negatives,",",precision,",",recall,",",F1,",",argv$fet,",",argv$tet,",",argv$rre,",",argv$rfe,",",argv$kofe,",",argv$mili,",",argv$milt,",",argv$fs,",",argv$top,",",argv$rer,",",argv$fer,",",argv$seed,",",argv$steepness,",",argv$pthres,",",argv$milw,",",argv$milthres,",",argv$midpoint,",",argv$minwint,",",argv$maxwint,"\n",sep=""))
  }
  return(my.rf)
}
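# Worked example of the decision rule above (hypothetical numbers): for an
# episode of 30 segments with min_warning_interval = 1 and
# max_warning_interval = 22, alarms at segments 1..7 (earlier than 30-22 = 8)
# each count as a false positive, while at least one alarm within segments
# 8..29 marks the whole episode as a true positive; an episode with no alarm
# in that window is a false negative.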
#plot actual vs predicted Risk for one test episode (note: this masks base::plot)
plot <- function(test_episodes_list, episode_index, my.rf){
  test_episodes = test_episodes_list[[episode_index]][ , !(names(test_episodes_list[[episode_index]]) %in% c("Timestamps"))]
  Prediction <- predict(my.rf, test_episodes)
  results = data.frame(Risk_F=test_episodes$Risk_F, num_Prediction=as.numeric(Prediction))
  mse = mean((Prediction-test_episodes$Risk_F)^2)
  chart = ggplot(results, aes((1:nrow(results)))) +
    # geom_rect(aes(xmin = ceiling(nrow(df_test)/2), xmax = nrow(df_test), ymin = -Inf, ymax = Inf), fill = "yellow", alpha = 0.003) +
    geom_line(aes(y = Risk_F, colour = "Actual")) +
    geom_line(aes(y = num_Prediction, colour="Predicted")) +
    labs(colour="Lines") +
    xlab("Segments") +
    ylab('Risk (F)') +
    ggtitle("Risk Prediction") + # (RR_KF_2YEARS_PAT08)
    theme(plot.title = element_text(hjust = 0.5)) +
    geom_text(aes(label = paste("MSE=",round(mse,3)), x = 20, y = 1), hjust = -2, vjust = 6, color="black", size=4) #add MSE label
  #disable the clip-area so that the MSE label is shown inside the plot
  gt <- ggplot_gtable(ggplot_build(chart))
  gt$layout$clip[gt$layout$name == "panel"] <- "off"
  grid.draw(gt)
}

p <- arg_parser("Implementation of the AIRBUS Predictor")
#positional arguments
p <- add_argument(p, "id", help="experiment ID")
p <- add_argument(p, "train", help="training dataset")
p <- add_argument(p, "test", help="test dataset")
p <- add_argument(p, "fet", help="number of different fault event types", default=151)
p <- add_argument(p, "tet", help="type of the target fault event", default=151)
#optional arguments
p <- add_argument(p, "--rre", help="remove rare events", default=TRUE)
p <- add_argument(p, "--rfe", help="remove frequent events", default=TRUE)
p <- add_argument(p, "--kofe", help="keep only the first occurrence of consecutive events", default=TRUE)
p <- add_argument(p, "--milt", help="MIL as written in the text of the paper", default=TRUE)
p <- add_argument(p, "--mili", help="MIL as shown in the figure of the paper", default=FALSE)
p <- add_argument(p, "--milthres", help="MIL threshold to the sigmoid function for over-sampling", default=0.4)
p <- add_argument(p, "--steepness", help="steepness of the sigmoid function", default=0.7)
p <- add_argument(p, "--midpoint", help="midpoint of the sigmoid function (in days)", default=11)
p <- add_argument(p, "--fs", help="apply feature selection", default=FALSE)
p <- add_argument(p, "--top", help="# of features to keep in feature selection", default=200)
p <- add_argument(p, "--rer", help="rare events ratio of the target event frequency", default=0.5)
p <- add_argument(p, "--fer", help="frequent events ratio of the frequency of the most frequent event", default=0.8)
p <- add_argument(p, "--milw", help="MIL window size (in days)", default=6)
p <- add_argument(p, "--pthres", help="prediction threshold to the Risk value for a true positive episode", default=0.5)
p <- add_argument(p, "--seed", help="seed for RF", default=500)
p <- add_argument(p, "--csv", help="output for csv", default=TRUE)
p <- add_argument(p, "--spme", help="export datasets for sequential pattern mining", default=FALSE)
p <- add_argument(p, "--java", help="the java path", default="/usr/bin/java")
p <- add_argument(p, "--python", help="the python path", default="/usr/bin/python")
p <- add_argument(p, "--cep", help="the path of the CEP rule-matching script", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spm_rules.py")
p <- add_argument(p, "--spmf", help="the spmf path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spmf.jar")
p <- add_argument(p, "--conf", help="minimum support (minsup)", default="20%")
p <- add_argument(p, "--minti", help="minimum time interval allowed between two successive itemsets of a sequential pattern", default=1)
p <- add_argument(p, "--maxti", help="maximum time interval allowed between two successive itemsets of a sequential pattern", default=5)
p <- add_argument(p, "--minwi", help="minimum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=1)
p <- add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11)
p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for a true positive decision", default=1)
p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for a true positive decision", default=22)
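# Example invocation (the script name and dataset paths are placeholders):
#   Rscript predictor.R 42 train.csv test.csv 151 151 --fs TRUE --top 100 --csv FALSE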
add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11) p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1) p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=22) argv = data.frame() if( length(commandArgs(trailingOnly = TRUE)) != 0){ argv <- parse_args(p) } else { argv <- parse_args(p,c(1,"training_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv","testing_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv",151,151)) } TEST_DATA = FALSE id = argv$id REMOVE_RARE_EVENTS = argv$rre REMOVE_FREQUENT_EVENTS = argv$rfe KEEP_ONLY_FIRST_OCCURENESS = argv$kofe MULTI_INSTANCE_LEARNING_TEXT = argv$milt #MIL as explained in the text MULTI_INSTANCE_LEARNING_IMAGE = argv$mili #MIL as presented in the figure FEATURE_SELECTION = argv$fs top_features = argv$top target_event_frequency_proportion_rare = argv$rer max_event_frequency_proportion_frequent = argv$fer milw = argv$milw F_thres = argv$milthres s = argv$steepness midpoint = argv$midpoint target_event = argv$tet b_length = argv$fet acceptance_threshold = argv$pthres export_spm = argv$spme seed = argv$seed csv = argv$csv max_warning_interval = argv$maxwint min_warning_interval = argv$minwint training_set = read_dataset(argv$train) test_set = read_dataset(argv$test) episodes_list <- preprocess(training_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres) #merge episodes merged_episodes = ldply(episodes_list, data.frame) merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))] if(FEATURE_SELECTION){ #remove columns with all values equal to zero merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0] merged_episodes = feature_selection(merged_episodes,top_features) } TEST_DATA = TRUE REMOVE_RARE_EVENTS = FALSE REMOVE_FREQUENT_EVENTS = FALSE KEEP_ONLY_FIRST_OCCURENESS = FALSE MULTI_INSTANCE_LEARNING_TEXT = FALSE #MIL as explained in the text MULTI_INSTANCE_LEARNING_IMAGE = FALSE #MIL as presented in the figure FEATURE_SELECTION = FALSE test_episodes_list <- preprocess(test_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres) my.rf = eval(merged_episodes,test_episodes_list,seed) # for(s in (0:6)){ # my.rf = eval(merged_episodes,test_episodes_list,seed) # seed = seed + 1 # } # for(ep in 1:length(test_episodes_list)){ # jpeg(paste(ep,'_rplot.jpg')) # plot(test_episodes_list,ep,my.rf) # dev.off() # } if(export_spm){ if(!csv){ print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~") } spm_train_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$train) spm_test_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$test) spm_results_path = gsub(".csv",paste("_results_",id,".csv",sep=""),argv$train) confidence = argv$conf min_dist_seq = argv$minti max_dist_seq = argv$maxti min_dist_first_last = argv$minwi 
if(export_spm){
  if(!csv){
    print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~")
  }
  spm_train_path = gsub(".csv", paste("_spm_",id,".csv",sep=""), argv$train)
  spm_test_path = gsub(".csv", paste("_spm_",id,".csv",sep=""), argv$test)
  spm_results_path = gsub(".csv", paste("_results_",id,".csv",sep=""), argv$train)
  confidence = argv$conf
  min_dist_seq = argv$minti
  max_dist_seq = argv$maxti
  min_dist_first_last = argv$minwi
  max_dist_first_last = argv$maxwi
  java_path = argv$java
  jspmf_path = argv$spmf
  python_path = argv$python
  cep_path = argv$cep
  max_warning_interval = argv$maxwint
  min_warning_interval = argv$minwint
  export_ds_for_spm(target_event, episodes_list, spm_train_path)
  export_ds_for_spm(target_event, test_episodes_list, spm_test_path)
  if (file.exists(spm_results_path)) {
    invisible(file.remove(spm_results_path))
  }
  #mine sequential patterns with SPMF (HirateYamana) and match them on the test set with the CEP script
  javaOutput <- system(paste(java_path,"-jar",jspmf_path,"run HirateYamana",spm_train_path,spm_results_path,confidence,min_dist_seq,max_dist_seq,min_dist_first_last,max_dist_first_last), intern = TRUE)
  #print(javaOutput)
  pythonOutput <- system(paste(python_path,cep_path,spm_results_path,spm_test_path,target_event), intern = TRUE)
  #print(pythonOutput)
  true_positives = 0
  false_positives = 0
  false_negatives = 0
  total_failures = 0
  d = 0
  #collect warning days as a numeric vector (a plain vector, unlike the
  #original list, supports the comparisons below)
  warnings = c()
  for(w in pythonOutput){
    #each output line carries a day number; lines containing "Failure" close an episode
    d = as.integer(str_extract(w, "\\-*\\d+\\.*\\d*"))
    if(!grepl("Failure",w,fixed=TRUE)){
      warnings = c(warnings,d)
    } else {
      total_failures = total_failures + 1
      if(length(warnings) == 0){
        false_negatives = false_negatives + 1
      } else {
        #warnings earlier than the allowed interval are false positives
        if(length(warnings[warnings < d-max_warning_interval]) > 0){
          false_positives = false_positives + length(warnings[warnings < d-max_warning_interval])
        }
        #a warning inside the interval makes the episode a true positive
        if(length(warnings[warnings >= (d-max_warning_interval)]) > 0 && length(warnings[warnings <= (d-min_warning_interval)]) > 0){
          true_positives = true_positives + 1
        } else {
          false_negatives = false_negatives + 1
        }
      }
      warnings = c()
    }
  }
  precision = true_positives/(true_positives+false_positives)
  if((true_positives+false_positives) == 0){
    precision = 0
  }
  recall = true_positives/total_failures
  F1 = 2*((precision*recall)/(precision+recall))
  if((precision+recall) == 0){
    F1 = 0
  }
  if(!csv){
    cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\n"))
  } else {
    cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$conf,",",argv$minti,",",argv$maxti,",",argv$minwi,",",argv$maxwi,",",argv$minwint,",",argv$maxwint, "\n",sep=""))
  }
}
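# When --csv is TRUE the script prints header-less CSV rows: the RF row holds
# test, TP, FP, FN, precision, recall, F1 followed by the run parameters, and
# the SPM row (only with --spme) holds test, TP, FP, FN, precision, recall,
# F1, conf, minti, maxti, minwi, maxwi, minwint, maxwint.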