suppressMessages(library(CORElearn))
suppressMessages(library(dplyr))
suppressMessages(library(plyr))
suppressMessages(library(data.table))
suppressMessages(library(randomForest))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(argparser))
suppressMessages(library(arules))
suppressMessages(library(arulesSequences))
suppressMessages(library(xgboost))

# Convert the per-day event lists to daily frequency vectors: one row per day,
# one column per event type, each cell holding that event's count for the day.
compute_frequency_vectors <- function(aggr_episode_df, b_length) {
  freq_aggr_episode_df <- data.frame(matrix(ncol = b_length + 1, nrow = 0))
  x <- c("Timestamps", paste("e_", c(1:b_length), sep = ""))
  for (i in 1:nrow(aggr_episode_df)) {
    # Init a vector of b_length zeros
    freq_vector <- as.vector(integer(b_length))
    seg <- aggr_episode_df[i, ]
    # For every event id present on this day, store that event's frequency
    for (value in seg$x[[1]]) {
      freq_vector[[value]] <- length(which(seg$x[[1]] == value))
    }
    # Append a new row; we go through a matrix, so the date is stored as its
    # numeric representation alongside the counts
    date <- as.Date(seg$Group.1[[1]])
    new_df <- data.frame(matrix(c(date, freq_vector), nrow = 1, ncol = b_length + 1))
    freq_aggr_episode_df <- rbind(freq_aggr_episode_df, new_df)
  }
  colnames(freq_aggr_episode_df) <- x
  return(freq_aggr_episode_df)
}

create_episodes_list <- function(ds2years, b_length) {
  if (!csv) {
    print("~~~~~~~CREATING FREQUENCY VECTORS~~~~~~~")
  }
  # Collect one row per event occurrence from the original dataset
  episode_df <- data.frame(Timestamps = as.Date(character()), Event_id = integer())
  for (i in 1:nrow(ds2years)) {
    meas <- ds2years[i, ]
    episode_df <- rbind(episode_df,
                        data.frame(Timestamps = meas$Timestamps, Event_id = meas$Event_id))
  }
  # Group the event ids by day, then turn each day into a frequency vector
  aggr_episode_df <- aggregate(episode_df[, 2],
                               FUN = function(x) { return(x) },
                               by = list(as.Date(episode_df$Timestamps, "%Y-%m-%d")))
  frequency_day_vectors <- compute_frequency_vectors(aggr_episode_df, b_length)
  return(frequency_day_vectors)
}

read_dataset <- function(path) {
  dataset <- read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  dataset[, 2] <- as.numeric(dataset[, 2])
  return(dataset)
}
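# Illustrative sketch (toy data, not part of the pipeline) of what
# create_episodes_list/compute_frequency_vectors produce -- two events of
# type 2 and one of type 3 on the first day, one event of type 1 on the next:
#
#   toy  <- data.frame(Timestamps = c(rep("2020-01-01", 3), "2020-01-02"),
#                      Event_id   = c(2, 2, 3, 1))
#   aggr <- aggregate(toy[, 2], FUN = function(x) { return(x) },
#                     by = list(as.Date(toy$Timestamps, "%Y-%m-%d")))
#   compute_frequency_vectors(aggr, 3)
#   #   Timestamps e_1 e_2 e_3
#   # 1      18262   0   2   1    (dates appear as their numeric value)
#   # 2      18263   1   0   0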
# Jaccard distance, adapted from
# https://stats.stackexchange.com/questions/176613/jaccard-similarity-in-r
# Takes a data.frame/matrix and the MARGIN passed to apply(): use margin = 2
# to sum over columns (wide format), margin = 1 to sum over rows (long format).
jaccard <- function(df, margin) {
  if (margin == 1 | margin == 2) {
    M_00 <- apply(df, margin, sum) == 0
    M_11 <- apply(df, margin, sum) == 2
    if (margin == 1) {
      df <- df[!M_00, ]
      JSim <- sum(M_11) / nrow(df)
    } else {
      df <- df[, !M_00]
      JSim <- sum(M_11) / length(df)
    }
    JDist <- 1 - JSim
    return(JDist)
  } else {
    # The original fell through to `break`, which is invalid outside a loop
    stop("margin must be 1 (rows) or 2 (columns)")
  }
}

# Label the observation window (OW) starting at `index`: TRUE if the target
# event occurs in the prediction window (PW). In training mode, returns NULL
# when the buffer window (BW) already contains the target event, so the
# caller can skip that OW.
label_OW <- function(index, frequency_day_vectors, target_event, X, M, Y, Z, N, test) {
  PW <- data.frame()
  if (test) {
    PW <- frequency_day_vectors[(index - 1 + X * M):(index + X * M + Z + Y), ]
  } else {
    if (Z != 0) {
      BW <- frequency_day_vectors[(index - 1 + (X * M)):(index + (X * M) + Z), ]
      if (sum(BW[paste("e_", target_event, sep = "")]) > 0) {
        return(NULL)
      }
    }
    PW <- frequency_day_vectors[(index - 1 + X * M + Z):(index + X * M + Z + Y), ]
  }
  return(sum(PW[paste("e_", target_event, sep = "")]) > 0)
}

# Pattern mining as described in the paper: mine frequent event sets over all
# OWs, then keep the patterns whose confidence (share of their occurrences
# that fall in positively labelled OWs) reaches conf_level.
getFrequentPatternsPaperConf <- function(frequency_day_vectors, target_event,
                                         X, M, Y, Z, N, conf_level, support) {
  test <- FALSE
  events_list <- list()
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    # Stop once the end of the PW would run past the last day
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    # Subset by row: the X*M days of this OW
    OW <- frequency_day_vectors[i:((X * M) + (i - 1)), ]
    freq <- sapply(OW, sum)
    events <- names(freq[freq > 0])
    events_list[length(events_list) + 1] <- list(events)
  }
  names(events_list) <- paste("OW", c(1:length(events_list)), sep = "")
  trans1 <- as(events_list, "transactions")
  freq_patterns <- apriori(trans1,
                           parameter = list(supp = support, target = "frequent",
                                            maxtime = 5, maxlen = 10, minlen = 1),
                           control = list(verbose = FALSE))
  if (length(items(freq_patterns)) == 0) {
    return(items(freq_patterns))
  }
  total_freq <- integer(length(items(freq_patterns)))
  positive_freq <- integer(length(items(freq_patterns)))
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    # Check the label of the OW
    Label <- label_OW(i, frequency_day_vectors, target_event, X, M, Y, Z, N, test)
    if (is.null(Label)) {
      if (!csv) {
        print("Target event in BW, moving to the next OW")
      }
      next
    }
    OW <- frequency_day_vectors[i:((X * M) + (i - 1)), ]
    logical_v <- items(freq_patterns) %in% colnames(OW[, colSums(OW != 0) > 0])
    # Separate index variable: the original reused `i`, shadowing the outer loop
    for (j in 1:length(logical_v)) {
      if (logical_v[j]) {
        if (Label) {
          positive_freq[j] <- positive_freq[j] + 1
        }
        total_freq[j] <- total_freq[j] + 1
      }
    }
  }
  conf <- positive_freq / total_freq
  return(items(freq_patterns)[conf >= conf_level])
}
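# Window layout per instance (intuition for label_OW and the sliding loops,
# with i the first day of the OW and N the step to the next instance):
#
#   |<-- OW: X*M days -->|<-- BW: Z days -->|<-- PW: Y days -->|
#   i               i+X*M-1                               i+X*M+Z+Y
#
# The OW supplies the features, the BW (when Z > 0) drops training instances
# whose target event fires too soon, and the PW supplies the label. Note that
# the slices in label_OW start at index-1+X*M, so adjacent windows share a
# boundary day.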
# Custom pattern logic: mine frequent event sets from the positive OWs only,
# then compute confidence as the pattern's count among positive OWs divided by
# its count among all OWs.
getFrequentPatternsCustom <- function(frequency_day_vectors, target_event,
                                      X, M, Y, Z, N, conf_level, support) {
  test <- FALSE
  events_list <- list()
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    # Stop once the end of the PW would run past the last day
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    # Check the label of the OW
    Label <- label_OW(i, frequency_day_vectors, target_event, X, M, Y, Z, N, test)
    if (is.null(Label)) {
      if (!csv) {
        print("Target event in BW, moving to the next OW")
      }
      next
    }
    # Only the positively labelled OWs contribute transactions
    if (Label) {
      OW <- frequency_day_vectors[i:((X * M) + (i - 1)), ]
      freq <- sapply(OW, sum)
      events <- names(freq[freq > 0])
      events_list[length(events_list) + 1] <- list(events)
    }
  }
  names(events_list) <- paste("OW", c(1:length(events_list)), sep = "")
  trans1 <- as(events_list, "transactions")
  freq_patterns <- apriori(trans1,
                           parameter = list(supp = support, target = "frequent",
                                            maxtime = 5, maxlen = 10, minlen = 1),
                           control = list(verbose = FALSE))
  if (length(items(freq_patterns)) == 0) {
    return(items(freq_patterns))
  }
  total_freq <- integer(length(items(freq_patterns)))
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    OW <- frequency_day_vectors[i:((X * M) + (i - 1)), ]
    logical_v <- items(freq_patterns) %in% colnames(OW[, colSums(OW != 0) > 0])
    for (j in 1:length(logical_v)) {  # separate index; the original reused `i`
      if (logical_v[j]) {
        total_freq[j] <- total_freq[j] + 1
      }
    }
  }
  conf <- quality(freq_patterns)$count / total_freq
  return(items(freq_patterns)[conf >= conf_level])
}

getFrequentPatterns <- function(frequency_day_vectors, target_event,
                                X, M, Y, Z, N, conf_level, support) {
  if (PATTERN_CUSTOM) {
    return(getFrequentPatternsCustom(frequency_day_vectors, target_event,
                                     X, M, Y, Z, N, conf_level, support))
  } else {
    return(getFrequentPatternsPaperConf(frequency_day_vectors, target_event,
                                        X, M, Y, Z, N, conf_level, support))
  }
}

# Return the first OW whose label is TRUE (used as the reference window for
# the Jaccard similarity feature).
get_first_positive_OW <- function(frequency_day_vectors, target_event, X, M, Y, Z, N, test) {
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    Label <- label_OW(i, frequency_day_vectors, target_event, X, M, Y, Z, N, test)
    if (is.null(Label)) {
      if (!csv) {
        print("moving to the next")
      }
      next
    }
    if (Label) {
      # Subset by row: the X*M days of the first positive OW
      return(frequency_day_vectors[i:((X * M) + (i - 1)), ])
    }
  }
}

# Event frequency per sub-window: split the OW's X*M days into X consecutive
# segments of M days and record each event's frequency in each segment.
compute_basic_statistic_features <- function(instance, OW) {
  # The sorted factor assigns the first M rows to one segment, the next M to
  # the next segment, and so on
  OW <- split(OW, factor(sort(rank(row.names(OW)) %% X)))
  for (j in 1:length(OW)) {
    SW <- OW[j]
    freq <- sapply(SW[[1]], sum)
    names(freq) <- paste(names(freq), paste("_freq_", j, sep = ""), sep = "")
    instance <- c(instance, freq)
  }
  return(instance)
}

# Mean/std of the intervals between consecutive occurrences of each event,
# plus min/max/mean distance of each event's occurrences from the prediction
# point (the last OW day has distance 1; days without the event keep 0).
compute_advanced_statistic_features <- function(OW) {
  V <- setNames(as.list(rep(0, (2 * length(names(OW))))),
                c(paste(names(OW), "_meanV", sep = ""),
                  paste(names(OW), "_stdV", sep = "")))
  for (e in 1:length(OW)) {
    event <- OW[e]
    error_interval <- numeric(0)
    event_indices <- which(event > 0)
    # Guard added: the original looped over length(...):2, which misbehaves
    # when the event occurs fewer than two times in the OW
    if (length(event_indices) >= 2) {
      for (d in length(event_indices):2) {
        error_interval[length(error_interval) + 1] <- event_indices[d] - event_indices[d - 1]
      }
    }
    meanV <- if (length(error_interval) >= 1) mean(error_interval) else 0
    stdV <- if (length(error_interval) >= 2) sd(error_interval) else 0
    V[paste(names(event), "_meanV", sep = "")] <- meanV
    V[paste(names(event), "_stdV", sep = "")] <- stdV
  }
  # Distance of each occurrence from the prediction point
  D <- data.frame(matrix(ncol = b_length, nrow = 0))
  for (d in 1:nrow(OW)) {
    error_instance_distance <- nrow(OW) - d + 1
    day <- OW[d, ]
    day[1:ncol(day)][day[1:ncol(day)] > 0] <- error_instance_distance
    D <- rbind(D, day)
  }
  min <- sapply(D, min)
  names(min) <- paste(names(min), "_minD", sep = "")
  max <- sapply(D, max)
  names(max) <- paste(names(max), "_maxD", sep = "")
  mean <- sapply(D, mean)
  names(mean) <- paste(names(mean), "_meanD", sep = "")
  return(c(min, max, mean, V))
}
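# Worked example (toy numbers, for intuition only): take a 5-day OW in which
# event e_1 occurs on days 1, 3 and 4.
#   - Inter-occurrence intervals (scanned backwards): 4-3 = 1 and 3-1 = 2,
#     so e_1_meanV = 1.5 and e_1_stdV = sd(c(1, 2)) ~ 0.71.
#   - Distances from the prediction point: day 1 -> 5, day 3 -> 3, day 4 -> 2,
#     while non-occurrence days keep 0; hence e_1_minD = 0, e_1_maxD = 5 and
#     e_1_meanD = (5 + 0 + 3 + 2 + 0) / 5 = 2.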
# Allocate the (empty) instance data.frame with the right number of columns:
# 5 statistic features per event (meanV, stdV, minD, maxD, meanD), X frequency
# features per event, optional pattern and similarity features, plus the
# label. On the training pass this also mines the frequent patterns and stores
# the first positive OW (both kept as globals via <<-).
init <- function(frequency_day_vectors, target_event, X, M, Y, Z, N,
                 PATTERN_FEATURES, SIMILARITY_FEATURE, test, conf, sup) {
  instances_df <- data.frame(matrix(ncol = (ncol(frequency_day_vectors) * 5 + ncol(frequency_day_vectors) * X + 1), nrow = 0))
  if (PATTERN_FEATURES && SIMILARITY_FEATURE) {
    if (!test) {
      freq_pattern_items <<- getFrequentPatterns(frequency_day_vectors, target_event, X, M, Y, Z, N, conf, sup)
      print(paste("pattern items length: ", length(freq_pattern_items)))
    }
    instances_df <- data.frame(matrix(ncol = (ncol(frequency_day_vectors) * 5 + ncol(frequency_day_vectors) * X + length(freq_pattern_items) + 1 + 1), nrow = 0))
  } else if (PATTERN_FEATURES) {
    if (!test) {
      freq_pattern_items <<- getFrequentPatterns(frequency_day_vectors, target_event, X, M, Y, Z, N, conf, sup)
      print(paste("pattern items length: ", length(freq_pattern_items)))
    }
    instances_df <- data.frame(matrix(ncol = (ncol(frequency_day_vectors) * 5 + ncol(frequency_day_vectors) * X + length(freq_pattern_items) + 1), nrow = 0))
  } else if (SIMILARITY_FEATURE) {
    instances_df <- data.frame(matrix(ncol = (ncol(frequency_day_vectors) * 5 + ncol(frequency_day_vectors) * X + 1 + 1), nrow = 0))
  }
  if (!test && SIMILARITY_FEATURE) {
    first_positive_OW <<- get_first_positive_OW(frequency_day_vectors, target_event, X, M, Y, Z, N, test)
  }
  if (PATTERN_FEATURES && length(freq_pattern_items) <= 0) {
    if (!csv) {
      print("WARNING: No patterns found")
    }
  }
  return(instances_df)
}

# Jaccard distance between this OW and the first positive training OW
compute_similarity_feature <- function(SIMILARITY_FEATURE, OW) {
  J <- list()
  if (SIMILARITY_FEATURE) {
    J <- setNames(list(0), list("jaccard"))
    J[1] <- jaccard(data.frame(OW, first_positive_OW), 2)
  }
  return(J)
}

# One binary feature per mined pattern. Note that arules' `%in%` marks an
# itemset as matching when it shares at least one item with the OW's active
# event columns (`%ain%` would require all of its items).
compute_pattern_features <- function(PATTERN_FEATURES, OW) {
  P <- list()
  if (PATTERN_FEATURES && length(freq_pattern_items) > 0) {
    P <- setNames(as.list(rep(0, length(freq_pattern_items))),
                  as.list(paste("p_", labels(freq_pattern_items), sep = "")))
    logical_v <- freq_pattern_items %in% colnames(OW[, colSums(OW != 0) > 0])
    for (l in 1:length(logical_v)) {
      if (logical_v[l]) {
        P[l] <- 1
      }
    }
  }
  return(P)
}

# Slide an OW of X*M days over the data with step N; each OW becomes one
# instance built from pattern, similarity and statistic features.
create_instances <- function(frequency_day_vectors, target_event, X, M, Y, Z, N,
                             PATTERN_FEATURES, SIMILARITY_FEATURE, test, conf, sup) {
  if (!csv) {
    print("~~~~~~~CREATING INSTANCES~~~~~~~")
  }
  frequency_day_vectors <- frequency_day_vectors[, !(names(frequency_day_vectors) %in% c("Timestamps"))]
  instances_df <- init(frequency_day_vectors, target_event, X, M, Y, Z, N, PATTERN_FEATURES, SIMILARITY_FEATURE, test, conf, sup)
  for (i in seq(1, nrow(frequency_day_vectors), by = N)) {
    # Stop once the end of the PW would run past the last day
    if ((i + X * M + Z + Y) > nrow(frequency_day_vectors)) {
      break
    }
    # Subset by row: the X*M days of this OW
    OW <- frequency_day_vectors[i:((X * M) + (i - 1)), ]
    instance <- compute_pattern_features(PATTERN_FEATURES, OW)
    instance <- c(instance, compute_similarity_feature(SIMILARITY_FEATURE, OW))
    instance <- c(instance, compute_advanced_statistic_features(OW))
    instance <- compute_basic_statistic_features(instance, OW)
    instance_df <- as.data.frame(instance)
    # Check the label of the OW
    Label <- label_OW(i, frequency_day_vectors, target_event, X, M, Y, Z, N, test)
    if (is.null(Label)) {
      if (!csv) {
        print("moving to the next")
      }
      next
    }
    instance_df$Label <- Label
    instances_df <- rbind(instances_df, instance_df)
    if (!csv) {
      print(paste("Created instances: ", nrow(instances_df), sep = ""))
    }
  }
  return(instances_df)
}
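# Illustrative sketch (toy values, not part of the pipeline): if the mined
# patterns are {e_2} and {e_1,e_3}, compute_pattern_features creates the
# binary columns p_{e_2} and p_{e_1,e_3}. For an OW whose active event
# columns are e_1 and e_2, `%in%` matches both itemsets (each shares at
# least one item), so both features are set to 1; with `%ain%` only {e_2}
# would match, since e_3 is absent from the OW.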
# Backward feature elimination: rank features with ReliefF, keep the `i`
# best-ranked ones, evaluate with XGBoost, shrink `i` by `step`, and return
# the feature set that achieved the best F1 on the test set.
feature_selection <- function(instances_df, test_instances_df, step, failure_incidents) {
  if (!csv) {
    print("~~~~~~~APPLYING FEATURE SELECTION~~~~~~~")
  }
  run <- TRUE
  i <- length(instances_df) - 1  # number of feature columns (Label excluded)
  max_F1 <- 0
  max_instances_df <- data.frame()
  while (run) {
    estReliefF <- attrEval(Label ~ ., instances_df, estimator = "ReliefFexpRank", ReliefIterations = 50)
    sorted_indices <- order(estReliefF, decreasing = TRUE)
    instances_df <- instances_df %>% select(sorted_indices[1:i], ncol(instances_df))
    F1 <- evalXGBoost(instances_df, test_instances_df, failure_incidents, TRUE)
    if (max_F1 == 0) {
      max_F1 <- F1
      max_instances_df <- instances_df
    } else if (F1 > max_F1) {
      max_F1 <- F1
      max_instances_df <- instances_df
    }
    i <- i - step
    if (i <= 0) {
      run <- FALSE
    }
  }
  return(max_instances_df)
}

# Day index (in the test timeline) of the last day of the p-th OW
compute_last_time_point_of_OW <- function(index) {
  OW_length <- X * M
  return(OW_length + ((index - 1) * N))
}

# Event-based evaluation: a failure on day d counts as a true positive if at
# least one warning falls within [d - max_warning_interval,
# d - min_warning_interval]; earlier warnings are false positives, and
# failures without a timely warning are false negatives.
eval_predictions <- function(Prediction, failure_incidents, fs = FALSE) {
  predictions <- list()
  for (p in 1:length(Prediction)) {
    d <- compute_last_time_point_of_OW(p)
    if (Prediction[p] == "TRUE") {
      predictions <- c(predictions, d)
    }
  }
  true_positives <- 0
  false_positives <- 0
  false_negatives <- 0
  for (i in 1:length(failure_incidents)) {
    d <- failure_incidents[i]
    # Warnings attributable to this failure: those after the previous failure
    warnings <- list()
    if (i == 1) {
      warnings <- predictions[predictions <= d]
    } else {
      warnings <- predictions[predictions > failure_incidents[i - 1] & predictions <= d]
    }
    if (length(warnings) == 0) {
      false_negatives <- false_negatives + 1
    } else {
      if (length(warnings[warnings < d - max_warning_interval]) > 0) {
        false_positives <- false_positives + length(warnings[warnings < d - max_warning_interval])
      }
      if (length(warnings[warnings >= (d - max_warning_interval)]) > 0 &
          length(warnings[warnings <= (d - min_warning_interval)]) > 0) {
        true_positives <- true_positives + 1
      } else {
        false_negatives <- false_negatives + 1
      }
    }
  }
  precision <- true_positives / (true_positives + false_positives)
  if ((true_positives + false_positives) == 0) {
    precision <- 0
  }
  recall <- true_positives / length(failure_incidents)
  F1 <- 2 * ((precision * recall) / (precision + recall))
  if ((precision + recall) == 0) {
    F1 <- 0
  }
  if (!fs) {
    if (!csv) {
      cat(paste("dataset:", argv$test, "\ntrue_positives:", true_positives,
                "\nfalse_positives:", false_positives,
                "\nfalse_negatives:", false_negatives,
                "\nprecision:", precision, "\nrecall:", recall, "\nF1:", F1, "\n"))
    } else {
      cat(paste(argv$test, ",", true_positives, ",", false_positives, ",",
                false_negatives, ",", precision, ",", recall, ",", F1, ",",
                argv$fet, ",", argv$tet, ",", argv$X, ",", argv$M, ",", argv$Y, ",",
                argv$Z, ",", argv$N, ",", argv$step, ",", argv$sup, ",", argv$conf, ",",
                argv$fs, ",", argv$pf, ",", argv$sf, ",", argv$plogic, ",",
                argv$minwint, ",", argv$maxwint, "\n", sep = ""))
    }
  }
  return(F1)
}

# Train an XGBoost classifier on the training instances and evaluate its
# warnings against the failure incidents
evalXGBoost <- function(instances_df, test_instances_df, failure_incidents, fs = FALSE) {
  set.seed(500)
  instances_df <- instances_df[, order(names(instances_df))]
  my.rf <- xgboost(data = as.matrix(instances_df[, !(names(instances_df) %in% c("Label"))]),
                   label = as.integer(as.logical(instances_df$Label)),
                   nthread = 8, eta = 0.6, nrounds = 10,
                   objective = "binary:logistic", verbose = 0)
  test_instances_df <- test_instances_df[, (names(test_instances_df) %in% names(instances_df))]
  test_instances_df <- test_instances_df[, order(names(test_instances_df))]
  dtest <- xgb.DMatrix(data = as.matrix(test_instances_df[, !(names(test_instances_df) %in% c("Label"))]),
                       label = as.integer(as.logical(test_instances_df$Label)))
  Prediction <- predict(my.rf, dtest)
  Prediction <- as.logical(as.numeric(Prediction > 0.5))
  return(eval_predictions(Prediction, failure_incidents, fs))
}

# Same evaluation with a random forest
eval <- function(instances_df, test_instances_df, failure_incidents, plot) {
  set.seed(500)
  my.rf <- randomForest(Label ~ ., data = instances_df, importance = TRUE)
  if (plot) {
    varImpPlot(my.rf)
  }
  Prediction <- predict(my.rf, test_instances_df[, !(names(test_instances_df) %in% c("Label"))])
  return(eval_predictions(Prediction, failure_incidents))
}
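# Worked example (toy numbers, for intuition only): with min_warning_interval
# = 1, max_warning_interval = 22, a failure on day d = 100 and warnings on
# days 60 and 90: day 60 falls before 100 - 22 = 78 and is a false positive,
# day 90 lies in [78, 99] so the failure counts as a true positive; with no
# warning in that range it would have been a false negative instead.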
p <- arg_parser("Implementation of IBM's ATM Predictor")
# Positional arguments
p <- add_argument(p, "train", help = "training dataset")
p <- add_argument(p, "test", help = "test dataset")
p <- add_argument(p, "fet", help = "number of distinct fault event types", default = 151)
p <- add_argument(p, "tet", help = "id of the target fault event type", default = 151)
p <- add_argument(p, "--X", help = "# of segments/sub-windows", default = 5)
p <- add_argument(p, "--M", help = "segment length (in days)", default = 4)
p <- add_argument(p, "--Y", help = "length of the prediction window (in days)", default = 30)
p <- add_argument(p, "--Z", help = "length of the buffer window (in days)", default = 0)
p <- add_argument(p, "--N", help = "moving step (in days)", default = 20)
p <- add_argument(p, "--step", help = "feature selection decrease step", default = 200)
p <- add_argument(p, "--sup", help = "pattern features apriori support", default = 0.8)
p <- add_argument(p, "--conf", help = "pattern features confidence", default = 0.6)
p <- add_argument(p, "--fs", help = "apply feature selection", default = TRUE)
p <- add_argument(p, "--pf", help = "use pattern features", default = FALSE)
p <- add_argument(p, "--sf", help = "use similarity feature", default = TRUE)
p <- add_argument(p, "--plogic", help = "TRUE for custom pattern detection logic, FALSE for the paper's logic", default = TRUE)
p <- add_argument(p, "--csv", help = "output as CSV", default = FALSE)
p <- add_argument(p, "--minwint", help = "min # of days before a failure in which a warning counts towards a true positive", default = 1)
p <- add_argument(p, "--maxwint", help = "max # of days before a failure in which a warning counts towards a true positive", default = 22)

argv <- data.frame()
if (length(commandArgs(trailingOnly = TRUE)) != 0) {
  argv <- parse_args(p)
} else {
  argv <- parse_args(p, c("training_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv",
                          "testing_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv",
                          151, 151))
}

training_set <- read_dataset(argv$train)
test_set <- read_dataset(argv$test)
b_length <- argv$fet
target_event <- argv$tet
X <- argv$X
M <- argv$M
Y <- argv$Y
Z <- argv$Z
N <- argv$N
step <- argv$step
sup <- argv$sup
conf <- argv$conf
FEATURE_SELECTION <- argv$fs
PATTERN_FEATURES <- argv$pf
JACCARD_FEATURE <- argv$sf
PATTERN_CUSTOM <- argv$plogic
csv <- argv$csv
max_warning_interval <- argv$maxwint
min_warning_interval <- argv$minwint

frequency_day_vectors <- create_episodes_list(training_set, b_length)
instances_df <- create_instances(frequency_day_vectors, target_event, X, M, Y, Z, N, PATTERN_FEATURES, JACCARD_FEATURE, FALSE, conf, sup)
# Remove columns whose values are all zero
label <- instances_df$Label
instances_df <- instances_df[, colSums(instances_df != 0) > 0]
instances_df$Label <- as.factor(label)

test_frequency_day_vectors <- create_episodes_list(test_set, b_length)
test_instances_df <- create_instances(test_frequency_day_vectors, target_event, X, M, Y, Z, N, PATTERN_FEATURES, JACCARD_FEATURE, TRUE, conf, sup)
test_instances_df$Label <- as.factor(test_instances_df$Label)

# Day indices (in the test timeline) on which the target event occurred.
# The original used grepl(1, ...), which matches the digit "1" anywhere in the
# character form of the count (so a count of 2 was missed and 10 matched);
# a simple > 0 test expresses the intent.
failure_incidents <- which(test_frequency_day_vectors[, paste("e_", target_event, sep = "")] > 0)

if (FEATURE_SELECTION) {
  print(length(instances_df))  # number of columns before selection
  instances_df <- feature_selection(instances_df, test_instances_df, step, failure_incidents)
  print(length(instances_df))  # number of columns after selection
}
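# Example invocation (the script file name `predictor.R` and dataset paths are
# assumptions; adjust to your setup):
#
#   Rscript predictor.R train.csv test.csv 151 151 --X 5 --M 4 --Y 30 \
#     --N 20 --fs TRUE --pf FALSE --sf TRUE --csv FALSE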
resultXGBOOST <- evalXGBoost(instances_df, test_instances_df, failure_incidents)
resultRF <- eval(instances_df, test_instances_df, failure_incidents, FALSE)