suppressMessages(library(CORElearn))
#load plyr before dplyr so that dplyr's verbs are not masked
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(randomForest))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(argparser))
suppressMessages(library(arules))
suppressMessages(library(arulesSequences))
suppressMessages(library(xgboost))
#convert per-day event-id vectors to per-day event frequency vectors
compute_frequency_vectors <- function(aggr_episode_df,b_length){
freq_aggr_episode_df <- data.frame(matrix(ncol = b_length+1, nrow = 0))
x <- c("Timestamps", paste("e_",c(1:b_length),sep = ""))
for(i in 1:nrow(aggr_episode_df)) {
#init a vector of b_length zeros
freq_vector = as.vector(integer(b_length))
seg <- aggr_episode_df[i,]
#for every event id in this day's segment, store how many times it occurred
for(value in seg$x[[1]]){
freq_vector[[value]] = length(which(seg$x[[1]] == value))
}
#append a new row to freq_aggr_episode_df
#the row is assembled through a matrix, so the Date is coerced to its numeric representation
date = as.Date(seg$Group.1[[1]])
new_df = data.frame(matrix(c(date, freq_vector),nrow=1,ncol=b_length+1))
freq_aggr_episode_df <- rbind(freq_aggr_episode_df,new_df)
}
colnames(freq_aggr_episode_df) <- x
return(freq_aggr_episode_df)
}
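# Usage sketch (commented out; `df` stands for a hypothetical two-column frame
# like the one produced by read_dataset below). For a day whose aggregated
# event ids are c(3, 3, 7) and b_length = 8, the returned row has
# e_3 = 2, e_7 = 1 and every other e_* equal to 0.
# aggr <- aggregate(df[ ,2], FUN=function(x){return(x)},
#                   by=list(as.Date(df$Timestamps, "%Y-%m-%d")))
# head(compute_frequency_vectors(aggr, b_length = 8))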
create_episodes_list <- function(ds2years,b_length){
if(!csv){
print("~~~~~~~CREATING FREQUENCY VECTORS~~~~~~~")
}
#divide the data into episodes
#a list with data.frames for the episodes (each episode one data.frame)
episodes_list = list()
#data.frame for episodes
episode_df <- data.frame(Timestamps=as.Date(character()),Event_id=integer())
#keep only the Timestamps and Event_id columns of the original dataset
#(equivalent to, but much faster than, a row-by-row rbind)
episode_df <- ds2years[, c("Timestamps","Event_id")]
aggr_episode_df = aggregate(episode_df[ ,2], FUN=function(x){return(x)}, by=list(as.Date(episode_df$Timestamps, "%Y-%m-%d")))
#turn the aggregated event lists into per-day frequency vectors
frequency_day_vectors = compute_frequency_vectors(aggr_episode_df,b_length)
return(frequency_day_vectors)
}
read_dataset <- function(path){
dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
dataset[, 2] <- as.numeric(dataset[, 2])
return(dataset)
}
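# Assumed on-disk format (hypothetical sample inferred from the column accesses
# in this script; real files may carry additional "#" comment lines):
# Timestamps,Event_id
# 2015-01-01,17
# 2015-01-01,3
# 2015-01-02,151
# ds <- read_dataset("events.csv")   # not executed here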
# Returns the Jaccard distance between two binary vectors stored in a data.frame
# adapted from https://stats.stackexchange.com/questions/176613/jaccard-similarity-in-r
# Arguments: df, a data.frame or matrix object, and margin, the MARGIN passed to
# apply. For wide-format data set margin = 2 to sum over columns; for long-format
# data set margin = 1 to sum over rows.
jaccard <- function(df, margin) {
if (margin == 1 || margin == 2) {
M_00 <- apply(df, margin, sum) == 0
M_11 <- apply(df, margin, sum) == 2
if (margin == 1) {
df <- df[!M_00, ]
JSim <- sum(M_11) / nrow(df)
} else {
df <- df[, !M_00]
JSim <- sum(M_11) / length(df)
}
JDist <- 1 - JSim
return(JDist)
} else stop("jaccard: margin must be 1 or 2") #break is invalid outside a loop
}
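# Worked sketch (commented out, toy data): two binary columns compared pairwise,
# so a row sum of 2 means "present in both" and a row sum of 0 is dropped.
# a <- c(1, 0, 1, 0)
# b <- c(1, 1, 0, 0)
# jaccard(data.frame(a, b), margin = 1)
# M_11 hits row 1 only and row 4 is removed as M_00, so JSim = 1/3 and the
# returned distance is 1 - 1/3 = 2/3.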
label_OW <- function(index,frequency_day_vectors,target_event,X,M,Y,Z,N,test){
#check the label of the OW
PW = data.frame()
if(test){
PW = frequency_day_vectors[(index-1+X*M):(index+X*M+Z+Y),]
} else {
if(Z != 0){
BW = frequency_day_vectors[(index-1+((X*M))):(index+((X*M))+Z),]
if(sum(BW[paste("e_",target_event,sep="")]) > 0){
return(NULL)
}
}
PW = frequency_day_vectors[(index-1+X*M+Z):(index+X*M+Z+Y),]
}
return(sum(PW[paste("e_",target_event,sep="")]) > 0)
}
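# Window layout, as I read the index arithmetic above (stated as an assumption):
# the OW starting at row `index` spans rows index..index+X*M-1, an optional
# buffer window of Z days follows, and the prediction window of Y days comes
# last. With X=5, M=4, Z=0, Y=30 and index=1 the OW is rows 1..20 and the PW
# slice is rows 20..51 (as written, the slices start on the OW's final row).
# The OW is labeled TRUE when the target event occurs inside the PW.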
getFrequentPatternsPaperConf <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,conf_level,support){
test = FALSE
events_list <- list()
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
OW = frequency_day_vectors[i:((X*M)+(i-1)),] #subset by row, get the X*M days
freq = sapply(OW, sum)
events = names(freq[freq>0])
events_list[length(events_list)+1] = list(events)
}
names(events_list) <- paste("OW",c(1:length(events_list)), sep = "")
trans1 <- as(events_list, "transactions")
freq_patterns <- apriori(trans1, parameter = list(supp=support,target="frequent",maxtime=5,maxlen=10, minlen=1),control=list(verbose = FALSE))
if(length(items(freq_patterns)) == 0){
return(items(freq_patterns))
}
total_freq = integer(length(items(freq_patterns)))
positive_freq = integer(length(items(freq_patterns)))
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
#check the label of the OW
Label = label_OW(i,frequency_day_vectors,target_event,X,M,Y,Z,N,test)
if(is.null(Label)){
if(!csv){
print("Error in BW moving to the next OW")
}
next
}
OW = frequency_day_vectors[i:((X*M)+(i-1)),] #subset by row, get the X*M days
logical_v = items(freq_patterns) %in% colnames(OW[, colSums(OW != 0) > 0])
#use a separate index so the outer OW loop variable is not shadowed
for(j in 1:length(logical_v)){
if(logical_v[j]){
if(Label){
positive_freq[j] = positive_freq[j] + 1
}
total_freq[j] = total_freq[j] + 1
}
}
}
conf = positive_freq/total_freq
return(items(freq_patterns)[conf>=conf_level])
}
getFrequentPatternsCustom <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,conf_level,support){
test = FALSE
events_list <- list()
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
#check the label of the OW
Label = label_OW(i,frequency_day_vectors,target_event,X,M,Y,Z,N,test)
if(is.null(Label)){
if(!csv){
print("Error in BW moving to the next OW")
}
next
}
#only for the True OWs
if(Label){
OW = frequency_day_vectors[i:((X*M)+(i-1)),] #subset by row, get the X*M days
freq = sapply(OW, sum)
events = names(freq[freq>0])
events_list[length(events_list)+1] = list(events)
}
}
names(events_list) <- paste("OW",c(1:length(events_list)), sep = "")
trans1 <- as(events_list, "transactions")
freq_patterns <- apriori(trans1, parameter = list(supp=support,target="frequent",maxtime=5,maxlen=10, minlen=1),control=list(verbose = FALSE))
if(length(items(freq_patterns)) == 0){
return(items(freq_patterns))
}
total_freq = integer(length(items(freq_patterns)))
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
OW = frequency_day_vectors[i:((X*M)+(i-1)),] #subset by row, get the X*M days
logical_v = items(freq_patterns) %in% colnames(OW[, colSums(OW != 0) > 0])
#again avoid shadowing the outer loop index
for(j in 1:length(logical_v)){
if(logical_v[j]){
total_freq[j] = total_freq[j] + 1
}
}
}
conf = quality(freq_patterns)$count/total_freq
return(items(freq_patterns)[conf>=conf_level])
}
getFrequentPatterns <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,conf_level,support){
if(PATTERN_CUSTOM){
return(getFrequentPatternsCustom(frequency_day_vectors,target_event,X,M,Y,Z,N,conf_level,support))
} else {
return(getFrequentPatternsPaperConf(frequency_day_vectors,target_event,X,M,Y,Z,N,conf_level,support))
}
}
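# The two confidence notions, summarized from the functions above:
# - paper logic: mine patterns over all OWs, then
#   conf = (# OWs where the pattern fires and the label is TRUE) /
#          (# OWs where the pattern fires)
# - custom logic: mine patterns over positive OWs only, then
#   conf = (support count among positive OWs) / (# of all OWs where it fires)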
get_first_positive_OW <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,test){
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
#check the label of the OW
Label = label_OW(i,frequency_day_vectors,target_event,X,M,Y,Z,N,test)
if(is.null(Label)){
if(!csv){
print("moving to the next")
}
next
}
if(Label){
return(frequency_day_vectors[i:((X*M)+(i-1)),]) #subset by row, get the X*M days
}
}
}
compute_basic_statistic_features <- function(instance,OW){
#compute event frequency per sub-window: split the OW into X contiguous
#sub-windows of M days each (sorting the rank %% X labels yields X
#equal-sized, ordered groups)
OW = split(OW, factor(sort(rank(row.names(OW))%%X)))
for(j in 1:length(OW)){
SW = OW[j]
freq = sapply(SW[[1]], sum)
names(freq) = paste(names(freq),paste("_freq_",j,sep=""),sep="")
instance = c(instance,freq)
}
return(instance)
}
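# Worked sketch (commented out, toy numbers; assigning X would shadow the
# global, so nothing below is meant to be run as-is). With X = 2 sub-windows
# over a 4-day OW holding a single event column:
# toy_OW <- data.frame(e_1 = c(1, 0, 2, 0)); rownames(toy_OW) <- 1:4
# compute_basic_statistic_features(list(), toy_OW)   # with X = 2
# yields e_1_freq_1 = 1 (days 1-2) and e_1_freq_2 = 2 (days 3-4).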
compute_advanced_statistic_features <- function(OW){
#compute inter-arrival intervals (days between consecutive occurrences) per event
V <- setNames(as.list(rep(0,(2*length(names(OW))))), c(paste(names(OW),"_meanV",sep=""), paste(names(OW),"_stdV",sep="")))
for(e in 1:length(OW)){
event = OW[e]
error_interval = c()
event_indices = which(event > 0)
#at least two occurrences are needed to define an interval; without this
#guard the loop below would run on 0:2 or 1:2 and index out of bounds
if(length(event_indices) >= 2){
for(d in length(event_indices):2){
error_interval[length(error_interval)+1] = event_indices[d] - event_indices[d-1]
}
}
meanV = mean(error_interval)
meanV = if(is.na(meanV)) 0 else meanV
stdV = sd(error_interval)
stdV = if(is.na(stdV)) 0 else stdV
V[paste(names(event),"_meanV",sep="")] = meanV
V[paste(names(event),"_stdV",sep="")] = stdV
}
#compute each event's distance (in days) from the prediction point
D <- data.frame(matrix(ncol = b_length, nrow = 0))
for(d in 1:nrow(OW)){
error_instance_distance = nrow(OW)-d+1
day = OW[d,]
day[1:ncol(day)][day[1:ncol(day)] > 0] = error_instance_distance
D <- rbind(D,day)
}
min = sapply(D,min)
names(min) = paste(names(min),"_minD",sep="")
max = sapply(D,max)
names(max) = paste(names(max),"_maxD",sep="")
mean = sapply(D,mean)
names(mean) = paste(names(mean),"_meanD",sep="")
return(c(min,max,mean,V))
}
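# Worked sketch (toy numbers): for a 5-day OW where e_1 fires on days 2 and 4,
# the single inter-arrival interval is 2, so e_1_meanV = 2 and e_1_stdV = 0
# (sd of one value is NA, mapped to 0). Distances from the prediction point are
# 5-2+1 = 4 and 5-4+1 = 2, so the column (0,4,0,2,0) gives e_1_minD = 0,
# e_1_maxD = 4 and e_1_meanD = 1.2.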
#allocate the (empty) instances data.frame; on the training pass this also
#computes the globals freq_pattern_items and first_positive_OW via <<-
init <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,PATTERN_FEATURES,SIMILARITY_FEATURE,test,conf,sup){
instances_df = data.frame(matrix(ncol = (ncol(frequency_day_vectors)*5+ncol(frequency_day_vectors)*X+1), nrow = 0))
if(PATTERN_FEATURES && SIMILARITY_FEATURE){
if(!test){
freq_pattern_items <<- getFrequentPatterns(frequency_day_vectors,target_event,X,M,Y,Z,N,conf,sup)
if(!csv){
print(paste("pattern items length: ",length(freq_pattern_items)))
}
}
instances_df = data.frame(matrix(ncol = (ncol(frequency_day_vectors)*5+ncol(frequency_day_vectors)*X+length(freq_pattern_items)+1+1), nrow = 0))
} else if(PATTERN_FEATURES){
if(!test){
freq_pattern_items <<- getFrequentPatterns(frequency_day_vectors,target_event,X,M,Y,Z,N,conf,sup)
if(!csv){
print(paste("pattern items length: ",length(freq_pattern_items)))
}
}
instances_df = data.frame(matrix(ncol = (ncol(frequency_day_vectors)*5+ncol(frequency_day_vectors)*X+length(freq_pattern_items)+1), nrow = 0))
} else if(SIMILARITY_FEATURE){
instances_df = data.frame(matrix(ncol = (ncol(frequency_day_vectors)*5+ncol(frequency_day_vectors)*X+1+1), nrow = 0))
}
if(!test && SIMILARITY_FEATURE){
first_positive_OW <<- get_first_positive_OW(frequency_day_vectors,target_event,X,M,Y,Z,N,test)
}
if(PATTERN_FEATURES && length(freq_pattern_items) <= 0){
if(!csv){
print("WARNING: No patterns found")
}
}
return(instances_df)
}
#single feature: Jaccard distance between the current OW and the first positive training OW
compute_similarity_feature <- function(SIMILARITY_FEATURE,OW){
J=list()
if(SIMILARITY_FEATURE){
J = setNames(list(0),list("jaccard"))
J[1] = jaccard(data.frame(OW,first_positive_OW),2)
}
return(J)
}
#one binary feature per mined pattern, set when the pattern matches the OW's
#active event columns (arules' %in% operator on the item matrix)
compute_pattern_features <- function(PATTERN_FEATURES,OW){
P=list()
if(PATTERN_FEATURES && length(freq_pattern_items) > 0){
P = setNames(as.list(rep(0,length(freq_pattern_items))),as.list(paste("p_",labels(freq_pattern_items),sep="")))
logical_v = freq_pattern_items %in% colnames(OW[, colSums(OW != 0) > 0])
for(l in 1:length(logical_v)){
if(logical_v[l]){
P[l] = 1
}
}
}
return(P)
}
create_instances <- function(frequency_day_vectors,target_event,X,M,Y,Z,N,PATTERN_FEATURES,SIMILARITY_FEATURE,test,conf,sup){
if(!csv){
print("~~~~~~~CREATING INSTANCES~~~~~~~")
}
frequency_day_vectors = frequency_day_vectors[ , !(names(frequency_day_vectors) %in% c("Timestamps"))]
instances_df = init(frequency_day_vectors,target_event,X,M,Y,Z,N,PATTERN_FEATURES,SIMILARITY_FEATURE,test,conf,sup);
for(i in seq(1,nrow(frequency_day_vectors), by=N)){
#stop once the PW would extend beyond the last day
if((i+X*M+Z+Y) > nrow(frequency_day_vectors)){
break
}
#subset by row, get the X*M days
OW = frequency_day_vectors[i:((X*M)+(i-1)),]
instance = compute_pattern_features(PATTERN_FEATURES,OW)
instance = c(instance,compute_similarity_feature(SIMILARITY_FEATURE,OW))
instance = c(instance,compute_advanced_statistic_features(OW))
instance = compute_basic_statistic_features(instance,OW)
instance_df = as.data.frame(instance)
#check the label of the OW
Label = label_OW(i,frequency_day_vectors,target_event,X,M,Y,Z,N,test)
if(is.null(Label)){
if(!csv){
print("moving to the next")
}
next
}
instance_df$Label = Label
instances_df = rbind(instances_df,instance_df)
if(!csv){
print(paste("Created instances: ", nrow(instances_df),sep=""))
}
}
return(instances_df)
}
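# Instance layout, inferred from the builders above (it matches the column
# counts allocated in init): optional pattern flags p_*, the optional jaccard
# feature, five advanced statistics per event (e_k_minD, e_k_maxD, e_k_meanD,
# e_k_meanV, e_k_stdV), X per-sub-window frequencies per event (e_k_freq_j)
# and, last, the Label.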
feature_selection <- function(instances_df,test_instances_df,step,failure_incidents){
if(!csv){
print("~~~~~~~APPLYING FEATURE SELECTION~~~~~~~")
}
run = TRUE
#start from all features (length() on a data.frame counts columns; -1 excludes Label)
i = length(instances_df)-1
max_F1 = 0
max_instances_df = data.frame()
while(run){
estReliefF <- attrEval(Label ~ ., instances_df, estimator="ReliefFexpRank", ReliefIterations=50)
sorted_indeces = order(estReliefF, decreasing = TRUE)
instances_df = instances_df %>% select(sorted_indeces[1:i],ncol(instances_df))
#print(length(instances_df))
F1 = evalXGBoost(instances_df,test_instances_df,failure_incidents,TRUE)
if(max_F1 == 0){
max_F1 = F1
max_instances_df = instances_df
} else if(F1>max_F1){
max_F1 = F1
max_instances_df = instances_df
}
i = i - step
if(i <= 0){
run = FALSE
}
}
return(max_instances_df)
}
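# The loop above is a backward feature-elimination search: rank features with
# ReliefFexpRank, keep the top i, score the subset by test-set F1 under
# XGBoost, shrink i by `step`, and return the best subset seen. Note that
# candidate subsets are scored on the test set itself.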
compute_last_time_point_of_OW <- function(index){
OW_length = X*M
return(OW_length+((index-1)*N))
}
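# Worked example with the defaults X=5, M=4, N=20: the 3rd OW ends on day
# 20 + (3-1)*20 = 60, which is the time point attributed to its prediction.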
eval_predictions <- function(Prediction,failure_incidents,fs=FALSE){
predictions = c() #days on which a warning was raised
for(p in 1:length(Prediction)){
d = compute_last_time_point_of_OW(p)
if(Prediction[p] == "TRUE"){
predictions = c(predictions,d)
}
}
true_positives = 0
false_positives = 0
false_negatives = 0
#score each failure against the warnings raised since the previous failure:
#warnings earlier than max_warning_interval days before the failure count as
#false positives; a warning within [min_warning_interval, max_warning_interval]
#days before the failure yields a true positive, otherwise a false negative
for(i in 1:length(failure_incidents)){
d = failure_incidents[i]
warnings = list()
if(i == 1){
warnings = predictions[predictions <= d]
} else {
warnings = predictions[predictions > failure_incidents[i-1] & predictions <= d]
}
if(length(warnings) == 0){
false_negatives = false_negatives + 1
} else {
if(length(warnings[warnings < d-max_warning_interval]) > 0){
false_positives = false_positives + length(warnings[warnings < d-max_warning_interval])
}
if(length(warnings[warnings >= (d-max_warning_interval)]) > 0 & length(warnings[warnings <= (d-min_warning_interval)]) > 0){
true_positives = true_positives + 1
} else {
false_negatives = false_negatives + 1
}
}
}
precision = true_positives/(true_positives+false_positives)
if((true_positives+false_positives) == 0){
precision = 0
}
recall = true_positives/length(failure_incidents)
F1 = 2*((precision*recall)/(precision+recall))
if((precision+recall) == 0){
F1 = 0
}
if(!fs){
if(!csv){
cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1,"\n"))
} else{
cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$fet,",",argv$tet,",",argv$X,",",argv$M,",",argv$Y,",",argv$Z,",",argv$N,",",argv$step,",",argv$sup,",",argv$conf,",",argv$fs,",",argv$pf,",",argv$sf,",",argv$plogic,",",argv$minwint,",",argv$maxwint, "\n",sep=""))
}
}
return(F1)
}
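# Worked sketch of the scoring rule (toy numbers, min_warning_interval = 1,
# max_warning_interval = 22): a failure on day 100 with warnings on days 60
# and 90 yields one false positive (60 < 100-22 = 78) and one true positive
# (78 <= 90 <= 99).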
evalXGBoost <- function(instances_df,test_instances_df,failure_incidents,fs=FALSE){
set.seed(500)
#order columns alphabetically so the train and test matrices line up
instances_df = instances_df[ , order(names(instances_df))]
my.xgb <- xgboost(data = as.matrix(instances_df[ , !(names(instances_df) %in% c("Label"))]), label = as.integer(as.logical(instances_df$Label)), nthread = 8, eta=0.6, nrounds = 10, objective = "binary:logistic",verbose = 0)
test_instances_df = test_instances_df[ , (names(test_instances_df) %in% names(instances_df))]
test_instances_df = test_instances_df[ , order(names(test_instances_df))]
dtest <- xgb.DMatrix(data = as.matrix(test_instances_df[ , !(names(test_instances_df) %in% c("Label"))]), label=as.integer(as.logical(test_instances_df$Label)))
Prediction <- predict(my.xgb, dtest)
Prediction <- as.logical(as.numeric(Prediction > 0.5))
return(eval_predictions(Prediction,failure_incidents,fs))
}
eval <- function(instances_df,test_instances_df,failure_incidents,plot){
set.seed(500)
my.rf = randomForest(Label ~ .,data=instances_df,importance=TRUE)
if(plot){
varImpPlot(my.rf)
}
Prediction <- predict(my.rf, test_instances_df[ , !(names(test_instances_df) %in% c("Label"))])
return(eval_predictions(Prediction,failure_incidents))
}
p <- arg_parser("Implementation of the IBMs ATM Predictor")
# Add a positional argument
p <- add_argument(p, "train", help="training dataset")
p <- add_argument(p, "test", help="test dataset")
p <- add_argument(p, "fet", help="different types of the fault events",default=151)
p <- add_argument(p, "tet", help="type of the target fault events",default=151)
p <- add_argument(p, "--X", help="# of segments/sub-windows", default=5)
p <- add_argument(p, "--M", help="segment legth (in days)", default=4)
p <- add_argument(p, "--Y", help="length of the prediction window (in days)", default=30)
p <- add_argument(p, "--Z", help="length of the buffer window (in days)", default=0)
p <- add_argument(p, "--N", help="moving step (in days)", default=20)
p <- add_argument(p, "--step", help="feature selection decrease step", default=200)
p <- add_argument(p, "--sup", help="pattern features appriori support", default=0.8)
p <- add_argument(p, "--conf", help="pattern features confidence", default=0.6)
p <- add_argument(p, "--fs", help="apply feature selection", default=TRUE)
p <- add_argument(p, "--pf", help="use pattern features", default=FALSE)
p <- add_argument(p, "--sf", help="use similarity feature", default=TRUE)
p <- add_argument(p, "--plogic", help="TRUE for custom pattern detection logic FALSE for paper's logic", default=TRUE)
p <- add_argument(p, "--csv", help="output for csv", default=FALSE)
p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1)
p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=22)
argv = data.frame()
if( length(commandArgs(trailingOnly = TRUE)) != 0){
argv <- parse_args(p)
} else {
argv <- parse_args(p,c("training_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv","testing_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv",151,151))
}
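# Example invocation (the script file name predictor.R is hypothetical; the
# data files are the defaults wired in above):
# Rscript predictor.R \
#   training_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv \
#   testing_dataset_150ft_151vl_1824d_90pc_50ppc_1minbt_5maxbt_1minbpe_2maxbpe_4pl_3minpf_4maxpf_2seed.csv \
#   151 151 --X 5 --M 4 --Y 30 --N 20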
training_set = read_dataset(argv$train)
test_set = read_dataset(argv$test)
b_length = argv$fet
target_event = argv$tet
X = argv$X
M = argv$M
Y = argv$Y
Z = argv$Z
N = argv$N
step = argv$step
sup = argv$sup
conf = argv$conf
FEATURE_SELECTION = argv$fs
PATTERN_FEATURES = argv$pf
JACCARD_FEATURE = argv$sf
PATTERN_CUSTOM = argv$plogic
csv = argv$csv
max_warning_interval = argv$maxwint
min_warning_interval = argv$minwint
frequency_day_vectors = create_episodes_list(training_set,b_length)
instances_df = create_instances(frequency_day_vectors,target_event,X,M,Y,Z,N,PATTERN_FEATURES,JACCARD_FEATURE,FALSE,conf,sup)
#remove columns with all values equal to zero
label = instances_df$Label
instances_df = instances_df[, colSums(instances_df != 0) > 0]
instances_df$Label = as.factor(label)
test_frequency_day_vectors = create_episodes_list(test_set,b_length)
test_instances_df = create_instances(test_frequency_day_vectors,target_event,X,M,Y,Z,N,PATTERN_FEATURES,JACCARD_FEATURE,TRUE,conf,sup)
test_instances_df$Label = as.factor(test_instances_df$Label)
#day indices on which the target event occurred in the test set; a direct
#numeric test replaces the original grepl(1, ...), which missed days whose
#event count did not contain the digit 1 (e.g. a count of 2)
failure_incidents = which(test_frequency_day_vectors[,paste("e_",target_event,sep="")] > 0)
if(FEATURE_SELECTION){
print(length(instances_df))
instances_df = feature_selection(instances_df,test_instances_df,step,failure_incidents)
print(length(instances_df))
}
resultXGBOOST = evalXGBoost(instances_df,test_instances_df,failure_incidents)
resultRF = eval(instances_df,test_instances_df,failure_incidents,FALSE)