Commit f19350bf40493228d5a842e024a13b8302e828dd

Authored by Thanasis Naskos
0 parents
Exists in master

first commit

Showing 24 changed files with 4429 additions and 0 deletions

supervised/Supervised_Predictor_v4.r View file @ f19350b
... ... @@ -0,0 +1,703 @@
  1 +suppressMessages(library(CORElearn))
  2 +suppressMessages(library(dplyr))
  3 +suppressMessages(library(plyr))
  4 +suppressMessages(library(data.table))
  5 +suppressMessages(library(randomForest))
  6 +suppressMessages(library(ggplot2))
  7 +suppressMessages(library(grid))
  8 +suppressMessages(library(argparser))
  9 +suppressMessages(library(stringr))
  10 +
  11 +export_ds_for_spm <- function(target_event,episodes_list,output){
  12 + if (file.exists(output)) {
  13 + file.remove(output)
  14 + }
  15 + #output in the Hirate-Yamana format (used for SPMF sequential pattern mining)
  16 + for(ep_index in (1:length(episodes_list))){
  17 + ep = episodes_list[[ep_index]][ , !(names(episodes_list[[ep_index]]) %in% c("Timestamps"))]
  18 + ep_list = list()
  19 + for(i in (1:nrow(ep))){
  20 + matches = which(ep[i,] %in% c(1))
  21 + if(length(matches) == 0){
  22 + next
  23 + }
  24 + line=paste(matches,collapse=" ")
  25 + ep_list[i] = line
  26 + }
  27 + if(length(ep_list) == 0){
  28 + next
  29 + }
  30 + ep_list[length(ep_list)+1] = target_event
  31 + episode = ""
  32 + for(ep_lli in (1:length(ep_list))){
  33 + if(length(ep_list[[ep_lli]]) > 0){
  34 + index = paste(paste("<",ep_lli,sep=""),">",sep="")
  35 + if(episode == ""){
  36 + episode = paste(index,ep_list[[ep_lli]],sep=" ")
  37 + } else {
  38 + episode = paste(episode,paste(index,ep_list[[ep_lli]],sep=" "),sep=" -1 ")
  39 + }
  40 + }
  41 + }
  42 + write(paste(episode,"-1 -2"),file=output,append=TRUE)
  43 + }
  44 +}
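# Illustrative sketch of the exported format (hypothetical event ids): an episode whose first
# segment contains events {3, 17} and whose second segment contains event {42}, with the target
# event 151 appended as the final itemset, would be written in the Hirate-Yamana form as
#   <1> 3 17 -1 <2> 42 -1 <3> 151 -1 -2
# where <t> marks the segment index, -1 closes an itemset and -2 closes the sequence.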
  45 +
  46 +remove_rare_events <- function(ds,target_event_frequency_proportion_rare){
  47 + if(!csv){
  48 + print("~~~~~~~APPLYING PREPROCESSING REMOVE RARE EVENTS~~~~~~~")
  49 + }
  50 + a = table(ds$Event_id)
  51 + target_event_frequency = a[names(a)==target_event][[1]]
  52 + rare_events = as.integer(names(a[a < target_event_frequency*target_event_frequency_proportion_rare]))
  53 + return(ds[!(ds$Event_id %in% rare_events),])
  54 +}
  55 +
  56 +remove_frequent_events <- function(ds,max_event_frequency_proportion_frequent){
  57 + if(!csv){
  58 + print("~~~~~~~APPLYING PREPROCESSING REMOVE FREQUENT EVENTS~~~~~~~")
  59 + }
  60 + a = table(ds[!(ds$Event_id == target_event),]$Event_id)
  61 + max_freq = sort(a,decreasing = TRUE)[[1]]
  62 + frequent_events = as.integer(names(a[a > max_freq*max_event_frequency_proportion_frequent]))
  63 + #print(frequent_events)
  64 + return(ds[!(ds$Event_id %in% frequent_events),])
  65 +}
  66 +
  67 +keep_only_first_occureness <- function(episodes_list){
  68 + if(!csv){
  69 + print("~~~~~~~APPLYING PREPROCESSING KEEP ONLY FIRST OCCURRENCE~~~~~~~")
  70 + }
  71 + #for every episode in the episodes_list
  72 + for(ep_index in (1:length(episodes_list))){
  73 + ep = episodes_list[[ep_index]]
  74 + #For every segment of each episode, starting from the end down to the second segment.
  75 + #We need to keep only the 1st occurrence of consecutive events, so starting from the end is the easiest way.
  76 + if(nrow(ep) < 2){
  77 + next
  78 + }
  79 + for(i in (nrow(ep):2)){
  80 + #as we deal with binary vectors, to find the indices where both vectors have a "1" we sum them and check for "2"s in the result
  81 + matches = which((ep[i,]+ep[i-1,]) %in% c(2))
  82 + #replace the 1s with 0s in the matching positions of the segment that is closer to the end of the episode
  83 + ep[i,][c(matches)] = 0
  84 + }
  85 + episodes_list[[ep_index]] = ep
  86 + }
  87 + return(episodes_list)
  88 +}
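# Illustrative example (hypothetical segments): for consecutive binary segments c(1,1,0) and
# c(1,0,1) the element-wise sum is c(2,1,1); position 1 holds a 2, so the later segment is
# rewritten as c(0,0,1) and only the first occurrence of that event is kept.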
  89 +
  90 +mil_text <- function(milw,F_thres,episodes_list,b_length){
  91 + if(!csv){
  92 + print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
  93 + }
  94 + window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
  95 +
  96 + #for every episode in the episodes_list
  97 + for(ep_index in (1:length(episodes_list))){
  98 + ep = episodes_list[[ep_index]]
  99 + if(nrow(ep) < 1){
  100 + next
  101 + }
  102 + new_ep = data.frame(matrix(ncol = b_length+1, nrow = 0))
  103 + i = 1
  104 + while(i <= nrow(ep)){
  105 + new_ep = rbind(new_ep,ep[i,])
  106 +
  107 + if(ep[i,][b_length+1] >= F_thres && nrow(window_df) < milw){
  108 + window_df = rbind(window_df,ep[i,])
  109 + }
  110 + if((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0){
  111 + mean = colMeans(window_df)
  112 + mean[mean > 0] = 1
  113 + mf <- data.frame(as.list(mean))
  114 + mf[1] = ep[i,][1]
  115 + mf[b_length+1] = ep[i,][b_length+1]
  116 + colnames(mf) = colnames(new_ep)
  117 + new_ep = rbind(new_ep,mf)
  118 + if(nrow(window_df) > 1){
  119 + i = i - (nrow(window_df)-2)
  120 + }
  121 + window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
  122 + }
  123 + i = i + 1
  124 + }
  125 + episodes_list[[ep_index]] = new_ep
  126 + }
  127 + return(episodes_list)
  128 +}
  129 +
  130 +mil_image <- function(milw,F_thres,episodes_list,b_length){
  131 + if(!csv){
  132 + print("~~~~~~~APPLYING PREPROCESSING MULTI INSTANCE LEARNING~~~~~~~")
  133 + }
  134 +
  135 + #for every episode in the episodes_list
  136 + for(ep_index in (1:length(episodes_list))){
  137 + ep = episodes_list[[ep_index]]
  138 + if(nrow(ep) < 1){
  139 + next
  140 + }
  141 + new_ep = data.frame(matrix(ncol = b_length+1, nrow = 0))
  142 + #a data.frame with the vectors that need to be averaged
  143 + window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
  144 + i = 1
  145 + while(i <= nrow(ep)){
  146 + #new_ep = rbind(new_ep,ep[i,])
  147 + if(nrow(window_df) < milw){
  148 + window_df = rbind(window_df,ep[i,])
  149 + }
  150 + if((nrow(window_df) == milw || i == nrow(ep)) && nrow(window_df) > 0){
  151 + mean = colMeans(window_df)
  152 + mean[mean > 0] = 1
  153 + mf = data.frame(as.list(mean))
  154 + mf[1] = ep[i,][1]
  155 + mf[b_length+1] = ep[i,][b_length+1]
  156 + #colnames(mf) = colnames(new_ep)
  157 + new_ep = rbind(new_ep,mf)
  158 + if(window_df[1,][b_length+1] >= F_thres && nrow(window_df) > 1){
  159 + i = i - (nrow(window_df)-1)
  160 + }
  161 + window_df = data.frame(matrix(ncol = b_length+1, nrow = 0))
  162 + }
  163 + i = i + 1
  164 + }
  165 + episodes_list[[ep_index]] = new_ep
  166 + }
  167 + return(episodes_list)
  168 +}
  169 +
  170 +#the Risk function
  171 +compute_F <- function(s,midpoint,t,ep_length){
  172 + #s affects the steepness
  173 + # s <- 0.9
  174 + return(1/(1+exp(s*(ep_length-midpoint-t))))
  175 +}
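# A worked sketch of the sigmoid (assumed parameter values): the risk of segment t in an
# episode of ep_length segments is F(t) = 1 / (1 + exp(s * (ep_length - midpoint - t))),
# rising towards 1 as t approaches the failure at the end of the episode, e.g.
#   compute_F(0.8, 4, 0, 10)   # ~0.008, far from the failure
#   compute_F(0.8, 4, 9, 10)   # ~0.92, one segment before the failure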
  176 +
  177 +#convert the event list of each segment into a frequency vector
  178 +compute_frequency_vectors <- function(aggr_episode_df,b_length,s,midpoint){
  179 + freq_aggr_episode_df <- data.frame(matrix(ncol = b_length+2, nrow = 0))
  180 + x <- c(c("Timestamps"), c(paste("e_",c(1:b_length),sep = "")), c("Risk_F"))
  181 + # colnames(bin_aggr_episode_df) <- x
  182 +
  183 + for(i in 1:nrow(aggr_episode_df)) {
  184 + #init a vector of b_length 0s
  185 + freq_vector = as.vector(integer(b_length))
  186 + seg <- aggr_episode_df[i,]
  187 + #store the frequency of each event id of the segment at the corresponding position of freq_vector
  188 + for(value in seg$x[[1]]){
  189 + freq_vector[[value]] = length(which(seg$x[[1]] == value))
  190 + }
  191 + #add a new row to the freq_aggr_episode_df
  192 + #we use a matrix holding the elements of the new data.frame, as a matrix can store values of different data types
  193 + F = compute_F(s,midpoint,i-1,nrow(aggr_episode_df))
  194 + if(midpoint >= nrow(aggr_episode_df)){
  195 + F = 0
  196 + }
  197 + date = seg$Timeframe[[1]]
  198 + new_df = data.frame(matrix(c(date, freq_vector,F),nrow=1,ncol=b_length+2))
  199 + freq_aggr_episode_df <- rbind(freq_aggr_episode_df,new_df)
  200 + }
  201 + # x <- c(c("Timestamps"), c(paste("e_",c(1:3405))), c("Risk_F"))
  202 + colnames(freq_aggr_episode_df) <- x
  203 + return(freq_aggr_episode_df)
  204 +}
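# Illustrative example (hypothetical segment content): if a one-hour segment holds the event
# ids c(3, 3, 7), the resulting row gets e_3 = 2, e_7 = 1 and 0 everywhere else; preprocess()
# later collapses these counts to binary 0/1 values.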
  205 +
  206 +create_episodes_list <- function(ds,target_event,b_length,s,midpoint,test_mode){
  207 + if(!csv){
  208 + print("~~~~~~~CREATING FREQUENCY VECTORS AND BINARIZING THEM~~~~~~~")
  209 + }
  210 + #divide into episodes
  211 + target_event_spotted = FALSE
  212 + #a list with data.frames for the episodes (each episode one data.frame)
  213 + episodes_list = list()
  214 + #data.frame for episodes
  215 + episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
  216 + #iterate over every line of the original dataset
  217 + for(i in 1:nrow(ds)) {
  218 + #get the current row of the ds
  219 + meas <- ds[i,]
  220 + #If it is the target event enable the appropriate flag
  221 + if((meas$Event_id == target_event)){
  222 + target_event_spotted = TRUE
  223 + }
  224 + #fill the episode data.frame with the events that are between two target events
  225 + if(meas$Event_id != target_event && target_event_spotted){
  226 + episode_df <- rbind(episode_df,data.frame(Timestamps=meas$Timestamps, Event_id=meas$Event_id))
  227 + } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) != 0){
  228 + #a second occurrence of the target event is spotted; close the episode
  229 + #target_event_spotted = FALSE
  230 + #aggregate the events by hour to form the segments inside the episode
  231 + aggr_episode_df = aggregate(episode_df[ ,2], FUN=function(x){return(x)}, by=list(Timeframe=cut(as.POSIXct(episode_df$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),"hour"))) #%Y-%m-%dT%H:%M:%OSZ
  232 + #build the frequency vectors for the episode
  233 + bin_aggr_episode_df = compute_frequency_vectors(aggr_episode_df,b_length,s,midpoint)
  234 +
  235 + #Remove event 0, which does not provide any info KOUGKA
  236 + bin_aggr_episode_df = bin_aggr_episode_df[ , !(names(bin_aggr_episode_df) %in% c("e_1"))]
  237 +
  238 + #add the episode to the episodes_list
  239 + episodes_list[[length(episodes_list)+1]] = bin_aggr_episode_df
  240 + #reset episode_df to an empty data.frame
  241 + episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
  242 + } else if(meas$Event_id == target_event && target_event_spotted && is.data.frame(episode_df) && nrow(episode_df) == 0 && test_mode){
  243 + freq_vector = as.vector(integer(b_length))
  244 + new_df = data.frame(matrix(c(0, freq_vector,0),nrow=1,ncol=b_length+2))
  245 + episode_df <- rbind(episode_df,new_df)
  246 +
  247 + x <- c(c("Timestamps"), c(paste("e_",c(1:b_length),sep = "")), c("Risk_F"))
  248 + colnames(episode_df) <- x
  249 + episode_df = episode_df[ , !(names(episode_df) %in% c("e_1"))]
  250 +
  251 + #add the episode to the episodes_list
  252 + episodes_list[[length(episodes_list)+1]] = episode_df
  253 + #reset episode_df to an empty data.frame
  254 + episode_df <- data.frame(Timestamps=as.POSIXct(character()),Event_id=integer())
  255 + }
  256 + }
  257 + return(episodes_list)
  258 +}
  259 +
  260 +preprocess <- function(ds,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,w,F_thres,test_mode){
  261 +
  262 + #Remove events that appear < n times, where n = (target event frequency) * rer
  263 + if(REMOVE_RARE_EVENTS){
  264 + ds<-remove_rare_events(ds,target_event_frequency_proportion_rare)
  265 + }
  266 +
  267 + #Remove events that appear > n times, where n = (frequency of the most frequent event) * fer
  268 + if(REMOVE_FREQUENT_EVENTS){
  269 + ds<-remove_frequent_events(ds,max_event_frequency_proportion_frequent)
  270 + }
  271 +
  272 + episodes_list = create_episodes_list(ds,target_event,b_length,s,midpoint,test_mode)
  273 + #if(length(episodes_list) == 0){
  274 + # return()
  275 + #}
  276 +
  277 + #binarize the frequency vectors
  278 + for(ep_index in (1:length(episodes_list))){
  279 + ep = episodes_list[[ep_index]]
  280 + ep[2:(ncol(ep)-1)][ep[2:(ncol(ep)-1)] > 0] = 1
  281 + episodes_list[[ep_index]] = ep
  282 + }
  283 +
  284 + # keep only the first occurrence of each event in consecutive segments
  285 + if(KEEP_ONLY_FIRST_OCCURENESS){
  286 + episodes_list <- keep_only_first_occureness(episodes_list)
  287 + }
  288 +
  289 + # Multi-instance learning to increase the pattern frequency
  290 + if(MULTI_INSTANCE_LEARNING_TEXT){
  291 + episodes_list <- mil_text(w,F_thres,episodes_list,b_length)
  292 + } else if(MULTI_INSTANCE_LEARNING_IMAGE){
  293 + episodes_list <- mil_image(w,F_thres,episodes_list,b_length)
  294 + }
  295 + return(episodes_list)
  296 +}
  297 +
  298 +feature_selection <- function(merged_episodes,top_features){
  299 + estReliefF <- attrEval(Risk_F ~ ., merged_episodes, estimator="RReliefFexpRank", ReliefIterations=50)
  300 + sorted_indeces = order(estReliefF, decreasing = TRUE)
  301 + merged_episodes = merged_episodes %>% select(sorted_indeces[1:top_features],ncol(merged_episodes))
  302 + return(merged_episodes)
  303 +}
  304 +
  305 +read_dataset <- function(path){
  306 + dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  307 + dataset[, 2] <- as.numeric(dataset[, 2])
  308 + return(dataset)
  309 +}
  310 +
  311 +read_dataset_cross <- function(path){
  312 + dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  313 + dataset[, 2] <- as.numeric(dataset[, 2])
  314 +
  315 + aggr_episode_df = aggregate(dataset, FUN=function(x){return(x)}, by=list(cut(as.POSIXct(dataset$Timestamps, format="%Y-%m-%dT%H:%M:%OSZ"),"day"))) #%Y-%m-%dT%H:%M:%OSZ
  316 +
  317 + training = sample_n(aggr_episode_df,(2*nrow(aggr_episode_df)/3))
  318 +
  319 + testing = setDT(aggr_episode_df)[!training, on="Group.1"]
  320 +
  321 + testing = testing[, !"Group.1"]
  322 + testing_df = data.frame(Timestamps=unlist(testing[,1][1][[1]]),Event_id=unlist(testing[,2][1][[1]]))
  323 + for(i in 2:nrow(testing)){
  324 + testing_df = rbind(testing_df,data.frame(Timestamps=unlist(testing[,1][i][[1]]),Event_id=unlist(testing[,2][i][[1]])))
  325 + }
  326 +
  327 + training = as.data.table(training)[, !"Group.1"]
  328 + training_df = data.frame(Timestamps=unlist(training[,1][1][[1]]),Event_id=unlist(training[,2][1][[1]]))
  329 + for(i in 2:nrow(training)){
  330 + training_df = rbind(training_df,data.frame(Timestamps=unlist(training[,1][i][[1]]),Event_id=unlist(training[,2][i][[1]])))
  331 + }
  332 +
  333 + return(list("training" = training_df, "testing"=testing_df))
  334 +}
  335 +
  336 +read_dataset_cross2 <- function(path,form){
  337 + dataset = read.table(path, header = TRUE, sep = ",", dec = ".", comment.char = "#")
  338 + dataset[, 2] <- as.numeric(dataset[, 2])
  339 + if(form == 1){
  340 + training_df = dataset[1:(2*nrow(dataset)/3),]
  341 + testing_df = dataset[(2*nrow(dataset)/3):nrow(dataset),]
  342 + } else if (form == 2){
  343 + training_df = dataset[(nrow(dataset)/3):nrow(dataset),]
  344 + testing_df = dataset[1:(nrow(dataset)/3),]
  345 + } else {
  346 + training_df = rbind(dataset[1:(nrow(dataset)/3),],dataset[(2*nrow(dataset)/3):nrow(dataset),])
  347 + testing_df = dataset[(nrow(dataset)/3+1):(2*nrow(dataset)/3),]
  348 + }
  349 + return(list("training" = training_df, "testing"=testing_df))
  350 +}
  351 +
  352 +find_next_fake_episodes <- function(ep_index,test_episodes_list){
  353 + fake_episodes_vec = vector()
  354 + cnt = 0
  355 + if(ep_index < length(test_episodes_list)){
  356 + for(i in ((ep_index+1):length(test_episodes_list))){
  357 + n_ep = test_episodes_list[i]
  358 + if(nrow(n_ep[[1]]) == 1 && n_ep[[1]]$Timestamps == 0){
  359 + cnt = cnt + 1
  360 + fake_episodes_vec <- c(fake_episodes_vec, i)
  361 + } else {
  362 + return(fake_episodes_vec)
  363 + }
  364 + }
  365 + }
  366 + return(fake_episodes_vec)
  367 +}
  368 +
  369 +find_next_close_episodes <- function(ep_index,test_episodes_list,pred_index){
  370 + ep_length = nrow(test_episodes_list[ep_index][[1]])
  371 + remaining_hours = max_warning_interval-(ep_length-pred_index)
  372 +
  373 + nc_episodes_vec = vector()
  374 + cnt = 0
  375 + while (remaining_hours > 0 && ep_index < length(test_episodes_list)) {
  376 + ep_index = ep_index + 1
  377 + n_ep = test_episodes_list[ep_index]
  378 + if(nrow(n_ep[[1]]) == 1 && n_ep[[1]]$Timestamps == 0){
  379 + next
  380 + }
  381 + n_ep_length = nrow(n_ep[[1]])
  382 + if(n_ep_length <= remaining_hours){
  383 + remaining_hours = remaining_hours-n_ep_length
  384 + if(remaining_hours >= 0){
  385 + cnt = cnt + 1
  386 + nc_episodes_vec <- c(nc_episodes_vec, ep_index)
  387 + }
  388 + } else{
  389 + return(nc_episodes_vec)
  390 + }
  391 + }
  392 + return(nc_episodes_vec)
  393 +}
  394 +
  395 +eval <- function(train_episodes,test_episodes_list,seed){
  396 + set.seed(seed)
  397 + my.rf = randomForest(Risk_F ~ .,data=train_episodes,importance=TRUE)
  398 + #varImpPlot(my.rf)
  399 + false_positives = 0
  400 + tp_vec = vector()
  401 + fp_vec = vector()
  402 + fn_vec = vector()
  403 + true_positives = 0
  404 + false_negatives = 0
  405 + ep_index = 1
  406 + while (ep_index <= length(test_episodes_list)) {
  407 + ep = test_episodes_list[[ep_index]]
  408 + fake_episodes_vec = find_next_fake_episodes(ep_index,test_episodes_list)
  409 + fake_ep_cnt = length(fake_episodes_vec)
  410 +
  411 + ep = ep[ , !(names(ep) %in% c("Timestamps"))]
  412 + Prediction <- predict(my.rf, ep)
  413 + ep_legth = length(Prediction)
  414 + pred_indeces = as.numeric(names(Prediction[Prediction >= acceptance_threshold]))
  415 + predicted_next_episodes = 0
  416 + if(length(pred_indeces) > 0){
  417 + pred_index = tail(pred_indeces, n=1)
  418 + nc_episodes_vec = find_next_close_episodes(ep_index,test_episodes_list,pred_index)
  419 +
  420 + predicted_next_episodes = length(nc_episodes_vec)
  421 + }
  422 +
  423 +
  424 + if(length(pred_indeces[pred_indeces < (ep_legth-(max_warning_interval))]) > 0){
  425 + fp_reps = length(pred_indeces[pred_indeces < (ep_legth-(max_warning_interval))])
  426 + false_positives = false_positives + fp_reps + (fake_ep_cnt*fp_reps)
  427 + fp_vec = c(fp_vec,rep(ep_index,fp_reps),rep(fake_episodes_vec,fp_reps))
  428 + }
  429 + if(length(pred_indeces[pred_indeces >= (ep_legth-(max_warning_interval)) & pred_indeces <= (ep_legth-min_warning_interval)]) > 0){
  430 + true_positives = true_positives + 1 + fake_ep_cnt + predicted_next_episodes
  431 + tp_vec = c(tp_vec,ep_index,fake_episodes_vec,nc_episodes_vec)
  432 + } else {
  433 + false_negatives = false_negatives + 1 + fake_ep_cnt
  434 + fn_vec = c(fn_vec,ep_index,fake_episodes_vec)
  435 + }
  436 + ep_index = ep_index + 1 + fake_ep_cnt + predicted_next_episodes
  437 + }
  438 +
  439 + precision = true_positives/(true_positives+false_positives)
  440 + if((true_positives+false_positives) == 0){
  441 + precision = 0
  442 + }
  443 + recall = true_positives/length(test_episodes_list)
  444 +
  445 + F1 = 2*((precision*recall)/(precision+recall))
  446 + if((precision+recall) == 0){
  447 + F1 = 0
  448 + }
  449 + if(!csv){
  450 + cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\ntp:",paste(as.character(tp_vec), sep="", collapse=","), "\nfp:",paste(as.character(fp_vec), sep="", collapse=","), "\nfn:",paste(as.character(fn_vec), sep="", collapse=",")))
  451 + } else {
  452 + cat(paste(argv$test,",",true_positives,",",false_positives,",",false_negatives,",",precision,",",recall,",",F1,",",argv$fet,",",argv$tet,",",argv$rre,",",argv$rfe,",",argv$kofe,",",argv$mili,",",argv$milt,",",argv$fs,",",argv$top,",",argv$rer,",",argv$fer,",",argv$seed,",",argv$steepness,",",argv$pthres,",",argv$milw,",",argv$milthres,",",argv$midpoint,",",argv$minwint,",",argv$maxwint,",",argv$cross,",tp,",paste(as.character(tp_vec), sep="", collapse=","),",fp,",paste(as.character(fp_vec), sep="", collapse=","),",fn,",paste(as.character(fn_vec), sep="", collapse=","),"\n",sep=""))
  453 + }
  454 + return(my.rf)
  455 +}
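# Worked example of the metrics (hypothetical counts): with 6 test episodes, 3 true positives,
# 1 false positive and 2 false negatives, the code above yields
#   precision = 3/(3+1) = 0.75, recall = 3/6 = 0.5, F1 = 2*(0.75*0.5)/(0.75+0.5) = 0.6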
  456 +
  457 +plot <- function(test_episodes_list, episode_index, my.rf){
  458 + test_episodes = test_episodes_list[[episode_index]][ , !(names(test_episodes_list[[episode_index]]) %in% c("Timestamps"))]
  459 + Prediction <- predict(my.rf, test_episodes)
  460 + results = data.frame(Risk_F=test_episodes$Risk_F,num_Prediction=as.numeric(Prediction))
  461 + mse = mean((Prediction-test_episodes$Risk_F)^2)
  462 +
  463 + chart =ggplot(results,aes((1:nrow(results)))) +
  464 + # geom_rect(aes(xmin = ceiling(nrow(df_test)/2), xmax = nrow(df_test), ymin = -Inf, ymax = Inf),
  465 + # fill = "yellow", alpha = 0.003) +
  466 + geom_line(aes(y = Risk_F, colour = "Actual")) +
  467 + geom_line(aes(y = num_Prediction, colour="Predicted")) +
  468 + labs(colour="Lines") +
  469 + xlab("Segments") +
  470 + ylab('Risk (F)') +
  471 + ggtitle("Risk Prediction") + # (RR_KF_2YEARS_PAT08)
  472 + theme(plot.title = element_text(hjust = 0.5)) +
  473 + geom_text(aes(label = paste("MSE=",round(mse,3)), x = 20, y = 1), hjust = -2, vjust = 6, color="black", size=4) #add MSE label
  474 +
  475 + # Disable clip-area so that the MSE is shown in the plot
  476 + gt <- ggplot_gtable(ggplot_build(chart))
  477 + gt$layout$clip[gt$layout$name == "panel"] <- "off"
  478 + grid.draw(gt)
  479 +}
  480 +
  481 +
  482 +p <- arg_parser("Implementation of the AIRBUS Predictor")
  483 +# Add a positional argument
  484 +p <- add_argument(p, "id", help="experiment ID")
  485 +p <- add_argument(p, "train", help="training dataset")
  486 +p <- add_argument(p, "test", help="test dataset")
  487 +p <- add_argument(p, "fet", help="number of different fault event types",default=151)
  488 +p <- add_argument(p, "tet", help="type id of the target fault event",default=151)
  489 +p <- add_argument(p, "--rre", help="remove rare events", default=FALSE)
  490 +p <- add_argument(p, "--rfe", help="remove frequent events", default=FALSE)
  491 +p <- add_argument(p, "--kofe", help="keep only the first occurrence of consecutive events", default=FALSE)
  492 +p <- add_argument(p, "--milt", help="MIL as written in the text of the paper", default=FALSE)
  493 +p <- add_argument(p, "--mili", help="MIL as shown in the Figure of the paper", default=FALSE)
  494 +p <- add_argument(p, "--milthres", help="MIL threshold to the sigmoid function for over-sampling", default=0.6)
  495 +p <- add_argument(p, "--steepness", help="steepness of the sigmoid function", default=0.8)
  496 +p <- add_argument(p, "--midpoint", help="midpoint of the sigmoid function (in days)", default=4)
  497 +p <- add_argument(p, "--fs", help="apply feature selection", default=FALSE)
  498 +p <- add_argument(p, "--top", help="# of features to keep in feature selection", default=200)
  499 +p <- add_argument(p, "--rer", help="rare events ratio of the target event frequency", default=0.2)
  500 +p <- add_argument(p, "--fer", help="frequent events ratio of the frequency of the most frequent event", default=0.8)
  501 +p <- add_argument(p, "--milw", help="MIL window size (in days)", default=4)
  502 +p <- add_argument(p, "--pthres", help="prediction threshold to the Risk value for a true positive episode", default=0.4)
  503 +p <- add_argument(p, "--seed", help="seed for RF", default=500)
  504 +p <- add_argument(p, "--csv", help="output for csv", default=FALSE)
  505 +
  506 +
  507 +p <- add_argument(p, "--spme", help="export datasets for sequential pattern mining", default=FALSE)
  508 +p <- add_argument(p, "--java", help="the java path", default="/usr/bin/java")
  509 +p <- add_argument(p, "--python", help="the python path", default="/usr/bin/python")
  510 +p <- add_argument(p, "--cep", help="the path to the spm_rules.py CEP script", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spm_rules.py")
  511 +p <- add_argument(p, "--spmf", help="the spmf path", default="/media/thanasis/Storage/ATLANTIS/0_Ensembled_Predictive_Solution_EPS/spmf.jar")
  512 +p <- add_argument(p, "--conf", help="minimum support (minsup)", default="20%")
  513 +p <- add_argument(p, "--minti", help="minimum time interval allowed between two successive itemsets of a sequential pattern", default=1)
  514 +p <- add_argument(p, "--maxti", help="maximum time interval allowed between two successive itemsets of a sequential pattern", default=5)
  515 +p <- add_argument(p, "--minwi", help="minimum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=1)
  516 +p <- add_argument(p, "--maxwi", help="maximum time interval allowed between the first itemset and the last itemset of a sequential pattern", default=11)
  517 +p <- add_argument(p, "--minwint", help="min # of days before failure to expect a warning for true positive decision", default=1)
  518 +p <- add_argument(p, "--maxwint", help="max # of days before failure to expect a warning for true positive decision", default=8)
  519 +
  520 +p <- add_argument(p, "--cross", help="cross validation", default=3)
  521 +p <- add_argument(p, "--form", help="train/test split arrangement for cross-validation (1, 2 or 3)", default=1)
  522 +
  523 +
  524 +
  525 +argv = data.frame()
  526 +if( length(commandArgs(trailingOnly = TRUE)) != 0){
  527 + argv <- parse_args(p)
  528 +} else {
  529 + argv <- parse_args(p,c(1,"/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch1_6405te_training.csv","/home/thanasis/Desktop/Atlantis/zBreak/Philips_AE_prediction/full_stops/all_channels/training/events1P5_LouvainPhilips_ch1_6405te_training.csv",6405,6405))
  530 +}
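# Example invocation (hypothetical file paths; flag syntax as provided by the argparser package):
#   Rscript Supervised_Predictor_v4.r 1 events_training.csv events_testing.csv 151 151 \
#     --kofe --milt --fs --top 200 --csv
# The positional arguments are the experiment id, the training and test datasets, the number of
# fault event types (fet) and the target event type (tet); the optional flags toggle the
# preprocessing steps defined above.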
  531 +
  532 +
  533 +TEST_DATA = FALSE
  534 +id = argv$id
  535 +REMOVE_RARE_EVENTS = argv$rre
  536 +REMOVE_FREQUENT_EVENTS = argv$rfe
  537 +KEEP_ONLY_FIRST_OCCURENESS = argv$kofe
  538 +MULTI_INSTANCE_LEARNING_TEXT = argv$milt #MIL as explained in the text
  539 +MULTI_INSTANCE_LEARNING_IMAGE = argv$mili #MIL as presented in the figure
  540 +FEATURE_SELECTION = argv$fs
  541 +top_features = argv$top
  542 +target_event_frequency_proportion_rare = argv$rer
  543 +max_event_frequency_proportion_frequent = argv$fer
  544 +milw = argv$milw
  545 +F_thres = argv$milthres
  546 +s = argv$steepness
  547 +midpoint = argv$midpoint
  548 +target_event = argv$tet
  549 +b_length = argv$fet
  550 +acceptance_threshold = argv$pthres
  551 +export_spm = argv$spme
  552 +seed = argv$seed
  553 +csv = argv$csv
  554 +max_warning_interval = argv$maxwint
  555 +min_warning_interval = argv$minwint
  556 +
  557 +CROSS = argv$cross
  558 +
  559 +if(CROSS == 1){
  560 + #set.seed(argv$seed)
  561 + d = read_dataset_cross2(argv$train,argv$form)
  562 + training_set = d$training
  563 + test_set = d$testing
  564 +} else if(CROSS == 2) {
  565 + set.seed(argv$seed)
  566 + d = read_dataset_cross(argv$train) # read_dataset_cross() takes only the dataset path
  567 + training_set = d$training
  568 + test_set = d$testing
  569 +} else {
  570 + training_set = read_dataset(argv$train)
  571 + test_set = read_dataset(argv$test)
  572 +}
  573 +
  574 +episodes_list <- preprocess(training_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,FALSE)
  575 +
  576 +#keep only 2/3 of the list
  577 +if(CROSS == 3){
  578 + episodes_list = episodes_list[1:(2*length(episodes_list)/3)]
  579 +} else if(CROSS == 4){
  580 + episodes_list = episodes_list[((length(episodes_list)/3)+1):length(episodes_list)]
  581 +} else if(CROSS == 5) {
  582 + episodes_list = episodes_list[-(((length(episodes_list)/3)+1):((2*length(episodes_list)/3)))]
  583 +}
  584 +
  585 +#merge episodes
  586 +merged_episodes = ldply(episodes_list, data.frame)
  587 +merged_episodes = merged_episodes[ , !(names(merged_episodes) %in% c("Timestamps"))]
  588 +
  589 +
  590 +if(FEATURE_SELECTION){
  591 + #remove columns with all values equal to zero
  592 + merged_episodes = merged_episodes[, colSums(merged_episodes != 0) > 0]
  593 + merged_episodes = feature_selection(merged_episodes,top_features)
  594 +}
  595 +
  596 +TEST_DATA = TRUE
  597 +REMOVE_RARE_EVENTS = FALSE
  598 +REMOVE_FREQUENT_EVENTS = FALSE
  599 +KEEP_ONLY_FIRST_OCCURENESS = FALSE
  600 +MULTI_INSTANCE_LEARNING_TEXT = FALSE #MIL as explained in the text
  601 +MULTI_INSTANCE_LEARNING_IMAGE = FALSE #MIL as presented in the figure
  602 +FEATURE_SELECTION = FALSE
  603 +test_episodes_list <- preprocess(test_set,TEST_DATA,REMOVE_RARE_EVENTS,REMOVE_FREQUENT_EVENTS,KEEP_ONLY_FIRST_OCCURENESS,MULTI_INSTANCE_LEARNING_TEXT,MULTI_INSTANCE_LEARNING_IMAGE,FEATURE_SELECTION,top_features,s,midpoint,b_length,target_event,target_event_frequency_proportion_rare,max_event_frequency_proportion_frequent,milw,F_thres,TRUE)
  604 +
  605 +#keep the remaining 1/3 of the episodes for testing
  606 +if(CROSS == 3){
  607 + test_episodes_list = test_episodes_list[((2*length(test_episodes_list)/3)+1):length(test_episodes_list)]
  608 +} else if(CROSS == 4){
  609 + test_episodes_list = test_episodes_list[1:(length(test_episodes_list)/3)]
  610 +} else if(CROSS == 5) {
  611 + test_episodes_list = test_episodes_list[(((length(test_episodes_list)/3)+1):((2*length(test_episodes_list)/3)))]
  612 +}
  613 +
  614 +my.rf = eval(merged_episodes,test_episodes_list,seed)
  615 +
  616 +# for(s in (0:6)){
  617 +# my.rf = eval(merged_episodes,test_episodes_list,seed)
  618 +# seed = seed + 1
  619 +# }
  620 +
  621 +# for(ep in 1:length(test_episodes_list)){
  622 +# jpeg(paste(ep,'_rplot.jpg'))
  623 +# plot(test_episodes_list,ep,my.rf)
  624 +# dev.off()
  625 +# }
  626 +
  627 +if(export_spm){
  628 + if(!csv){
  629 + print("~~~~~~~SEQUENTIAL PATTERN MINING~~~~~~~")
  630 + }
  631 + spm_train_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$train)
  632 + spm_test_path = gsub(".csv",paste("_spm_",id,".csv",sep=""),argv$test)
  633 + spm_results_path = gsub(".csv",paste("_results_",id,".csv",sep=""),argv$train)
  634 + confidence = argv$conf
  635 + min_dist_seq = argv$minti
  636 + max_dist_seq = argv$maxti
  637 + min_dist_first_last = argv$minwi
  638 + max_dist_first_last = argv$maxwi
  639 + java_path = argv$java
  640 + jspmf_path = argv$spmf
  641 + python_path = argv$python
  642 + cep_path = argv$cep
  643 + max_warning_interval = argv$maxwint
  644 + min_warning_interval = argv$minwint
  645 + export_ds_for_spm(target_event,episodes_list,spm_train_path)
  646 + export_ds_for_spm(target_event,test_episodes_list,spm_test_path)
  647 +
  648 + if (file.exists(spm_results_path)) {
  649 + invisible(file.remove(spm_results_path))
  650 + }
  651 +
  652 + javaOutput <- system(paste(java_path,"-jar",jspmf_path,"run HirateYamana",spm_train_path,spm_results_path,confidence,min_dist_seq,max_dist_seq,min_dist_first_last,max_dist_first_last), intern = TRUE)
  653 + #print(javaOutput)
  654 +
  655 + pythonOutput <- system(paste(python_path,cep_path,spm_results_path,spm_test_path,target_event), intern = TRUE)
  656 + #print(pythonOutput)
  657 + true_positives = 0
  658 + false_positives = 0
  659 + false_negatives = 0
  660 + total_failures = 0
  661 + d = 0
  662 +
  663 + warnings = list()
  664 + for(w in pythonOutput){
  665 + d = as.integer(str_extract(w, "\\-*\\d+\\.*\\d*"))
  666 + if(!grepl("Failure",w,fixed=TRUE)){
  667 + warnings = c(warnings,d)
  668 + } else {
  669 + total_failures = total_failures + 1
  670 + if(length(warnings) == 0){
  671 + false_negatives = false_negatives + 1
  672 + } else {
  673 + if(length(warnings[warnings < d-max_warning_interval]) > 0){
  674 + false_positives = false_positives + length(warnings[warnings < d-max_warning_interval])
  675 + }
  676 + if(length(warnings[warnings >= (d-max_warning_interval)]) > 0 & length(warnings[warnings <= (d-min_warning_interval)]) > 0){
  677 + true_positives = true_positives + 1
  678 + } else {
  679 + false_negatives = false_negatives + 1
  680 + }
  681 + }
  682 + warnings = list()
  683 + }
  684 + }
  685 +
  686 + precision = true_positives/(true_positives+false_positives)
  687 + if((true_positives+false_positives) == 0){
  688 + precision = 0
  689 + }
  690 +
  691 + recall = true_positives/total_failures
  692 +
  693 + F1 = 2*((precision*recall)/(precision+recall))
  694 + if((precision+recall) == 0){
  695 + F1 = 0
  696 + }
  697 +
  698 + if(!csv){
  699 + cat(paste("dataset:",argv$test,"\ntrue_positives:", true_positives,"\nfalse_positives:", false_positives,"\nfalse_negatives:", false_negatives,"\nprecision:", precision,"\nrecall:", recall,"\nF1:", F1, "\n"))
  700 + } else {
  701 + cat(paste(argv$test,",", true_positives,",", false_positives,",", false_negatives,",", precision,",", recall,",", F1,",",argv$conf,",",argv$minti,",",argv$maxti,",",argv$minwi,",",argv$maxwi,",",argv$minwint,",",argv$maxwint, "\n",sep=""))
  702 + }
  703 +}
unsupervised/pom.xml View file @ f19350b
... ... @@ -0,0 +1,53 @@
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 + <modelVersion>4.0.0</modelVersion>
  4 + <groupId>com</groupId>
  5 + <artifactId>aFaultDetection</artifactId>
  6 + <version>5.0</version>
  7 + <packaging>jar</packaging>
  8 + <properties>
  9 + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  10 + <maven.compiler.source>1.8</maven.compiler.source>
  11 + <maven.compiler.target>1.8</maven.compiler.target>
  12 + <mainClass>com.atl.smartmaintenance.AFaultDetection</mainClass>
  13 + </properties>
  14 + <dependencies>
  15 + <dependency>
  16 + <groupId>org.influxdb</groupId>
  17 + <artifactId>influxdb-java</artifactId>
  18 + <version>2.10</version>
  19 + <type>jar</type>
  20 + </dependency>
  21 +
  22 + </dependencies>
  23 +
  24 + <build>
  25 + <plugins>
  26 + <plugin>
  27 + <artifactId>maven-assembly-plugin</artifactId>
  28 + <configuration>
  29 + <appendAssemblyId>false</appendAssemblyId>
  30 + <archive>
  31 + <manifest>
  32 + <mainClass>${mainClass}</mainClass>
  33 + </manifest>
  34 + </archive>
  35 + <descriptorRefs>
  36 + <descriptorRef>jar-with-dependencies</descriptorRef>
  37 + </descriptorRefs>
  38 + </configuration>
  39 + <executions>
  40 + <execution>
  41 + <id>make-assembly</id> <!-- this is used for inheritance merges -->
  42 + <phase>package</phase> <!-- bind to the packaging phase -->
  43 + <goals>
  44 + <goal>single</goal>
  45 + </goals>
  46 + </execution>
  47 + </executions>
  48 + </plugin>
  49 + </plugins>
  50 + </build>
  51 +
  52 +
  53 +</project>
0 54 \ No newline at end of file
unsupervised/src/main/java/com/atl/afaultdetection/utils/InfluxDBHandler.java View file @ f19350b
... ... @@ -0,0 +1,203 @@
  1 +package com.atl.afaultdetection.utils;
  2 +
  3 +import com.atl.mcod.utils.Data;
  4 +import com.atl.mcod.utils.Constants;
  5 +import java.security.KeyManagementException;
  6 +import java.security.NoSuchAlgorithmException;
  7 +import java.security.cert.CertificateException;
  8 +import java.time.Duration;
  9 +import java.util.HashMap;
  10 +import java.util.List;
  11 +import java.util.Map;
  12 +import java.util.concurrent.TimeUnit;
  13 +import java.util.logging.Level;
  14 +import java.util.logging.Logger;
  15 +import javax.net.ssl.HostnameVerifier;
  16 +import javax.net.ssl.SSLContext;
  17 +import javax.net.ssl.SSLSession;
  18 +import javax.net.ssl.TrustManager;
  19 +import javax.net.ssl.X509TrustManager;
  20 +import okhttp3.OkHttpClient;
  21 +import org.influxdb.InfluxDB;
  22 +import org.influxdb.InfluxDBFactory;
  23 +import org.influxdb.dto.Point;
  24 +import org.influxdb.dto.Query;
  25 +import org.influxdb.dto.QueryResult;
  26 +
  27 +/**
  28 + *
  29 + * @author thanasis
  30 + */
  31 +public class InfluxDBHandler {
  32 +
  33 + private InfluxDB influxDB;
  34 + private String database;
  35 + private String resultsDatabase;
  36 + private Map<String, Data> lastReportedMap; //to avoid multiple reports
  37 +
  38 +// public InfluxDBHandler(String host, String port, String username, String password, String database) {
  39 +// OkHttpClient.Builder builder = new OkHttpClient.Builder()
  40 +// .readTimeout(100, TimeUnit.SECONDS).connectTimeout(1000, TimeUnit.SECONDS);
  41 +// influxDB = InfluxDBFactory.connect("https://" + host + ":" + port, username, password, builder);
  42 +// influxDB.setDatabase(database);
  43 +// this.database = database;
  44 +// this.lastReportedMap = new HashMap<>();
  45 +// }
  46 +
  47 + public InfluxDBHandler(String host, String port, String database, String resutlsDatabase, String username, String password, boolean ssl) {
  48 + try {
  49 + OkHttpClient.Builder builder = null;
  50 + String http = "http://";
  51 + if(ssl){
  52 + // Create a trust manager that does not validate certificate chains
  53 + final TrustManager[] trustAllCerts = new TrustManager[]{
  54 + new X509TrustManager() {
  55 + @Override
  56 + public void checkClientTrusted(java.security.cert.X509Certificate[] chain, String authType) throws CertificateException {
  57 + }
  58 +
  59 + @Override
  60 + public void checkServerTrusted(java.security.cert.X509Certificate[] chain, String authType) throws CertificateException {
  61 + }
  62 +
  63 + @Override
  64 + public java.security.cert.X509Certificate[] getAcceptedIssuers() {
  65 + return new java.security.cert.X509Certificate[]{};
  66 + }
  67 + }
  68 + };
  69 +
  70 + // Install the all-trusting trust manager
  71 + SSLContext sc = SSLContext.getInstance("SSL");
  72 + sc.init(null, trustAllCerts, new java.security.SecureRandom());
  73 +
  74 + builder = new OkHttpClient.Builder()
  75 + .sslSocketFactory(sc.getSocketFactory(), (X509TrustManager) trustAllCerts[0]).readTimeout(100, TimeUnit.SECONDS).connectTimeout(1000, TimeUnit.SECONDS);
  76 + builder.hostnameVerifier(new HostnameVerifier() {
  77 + @Override
  78 + public boolean verify(String hostname, SSLSession session) {
  79 + return true;
  80 + }
  81 + });
  82 + http = "https://";
  83 + } else {
  84 + builder = new OkHttpClient.Builder()
  85 + .readTimeout(100, TimeUnit.SECONDS).connectTimeout(1000, TimeUnit.SECONDS);
  86 + http = "http://";
  87 + }
  88 + if(username == null || username.equals("")){
  89 + influxDB = InfluxDBFactory.connect(http + host + ":" + port, builder);
  90 + } else {
  91 + influxDB = InfluxDBFactory.connect(http + host + ":" + port, username, password, builder);
  92 + }
  93 + influxDB.setDatabase(database);
  94 + this.database = database;
  95 + this.resultsDatabase = resutlsDatabase;
  96 +
  97 + if(!dbExists(this.resultsDatabase)){
  98 + influxDB.query(new Query("create database " + this.resultsDatabase + " with name \"autogen\"",resutlsDatabase));
  99 + }
  100 +
  101 + this.lastReportedMap = new HashMap<>();
  102 + } catch (NoSuchAlgorithmException ex) {
  103 + Logger.getLogger(InfluxDBHandler.class.getName()).log(Level.SEVERE, null, ex);
  104 + } catch (KeyManagementException ex) {
  105 + Logger.getLogger(InfluxDBHandler.class.getName()).log(Level.SEVERE, null, ex);
  106 + }
  107 + }
  108 +
  109 + public InfluxDB getInfluxDBHandler() {
  110 + return influxDB;
  111 + }
  112 +
  113 + public QueryResult getMeasurements(String measurementName, long seconds, boolean excludeZeroValues){
  114 + if(influxDB != null){
  115 + influxDB.setDatabase(this.database);
  116 + Query query = new Query("SELECT mean(value) FROM "+measurementName+" WHERE time > now() - "+seconds+"s and time <= now() "+(excludeZeroValues?"and value != 0":"")+" group by time(1s) fill(linear)", database);
  117 + QueryResult results = influxDB.query(query);
  118 + return results;
  119 + } else {
  120 + System.err.print("No DB connection");
  121 + }
  122 + return new QueryResult();
  123 + }
  124 + //for debugging
  125 + public QueryResult getMeasurementsDebug(String measurementName, long seconds, boolean excludeZeroValues) {
  126 + if (influxDB != null) {
  127 + influxDB.setDatabase(this.database);
  128 +// Query query = new Query("SELECT mean(value) FROM " + measurementName + " WHERE time > '2018-01-25T00:00:01.004Z' + " + (seconds - Constants.slide) + "s and time <= '2018-01-25T00:00:01.004Z' + " + seconds + "s " + (excludeZeroValues ? "and value != 0" : "") + " group by time(1s) fill(linear)", database);
  129 + //Query query = new Query("SELECT mean(value) FROM " + measurementName + " WHERE time > '2018-07-05T00:00:01.004Z' + " + (seconds - Constants.slide) + "s and time <= '2018-07-05T00:00:01.004Z' + " + seconds + "s " + (excludeZeroValues ? "and value != 0" : "") + " group by time(1s) fill(linear)", database);
  130 + //FOR AE TESTING
  131 + Query query = new Query("SELECT value AS \"mean\" FROM " + measurementName + " WHERE time > '2018-08-19T00:00:01.004Z' + " + (seconds - Constants.slide) + "s and time <= '2018-08-19T00:00:01.004Z' + " + seconds + "s " /*+ (excludeZeroValues ? "and value != 0" : "") + ""*/, database);
  132 + //FOR THICK TESTING
  133 + //Query query = new Query("SELECT value AS \"mean\" FROM " + measurementName + " WHERE time > '2018-10-14T00:00:01.004Z' + " + (seconds - Constants.slide) + "s and time <= '2018-10-14T00:00:01.004Z' + " + seconds + "s " /*+ (excludeZeroValues ? "and value != 0" : "") + ""*/, database);
  134 + QueryResult results = influxDB.query(query);
  135 + return results;
  136 + } else {
  137 + System.err.print("No DB connection");
  138 + }
  139 + return new QueryResult();
  140 + }
  141 +
  142 + public Double getMinValue(String measurementName, boolean excludeZeroValues, boolean excludeNegativeValues) {
  143 + if (influxDB != null) {
  144 + influxDB.setDatabase(this.database);
  145 + Query query = new Query("SELECT min(value) FROM " + measurementName + (excludeNegativeValues && excludeZeroValues ? " where value > 0" : excludeZeroValues ? " where value != 0" : excludeNegativeValues ? " where value >= 0" : ""), database);
  146 + QueryResult results = influxDB.query(query);
  147 + return (Double) results.getResults().get(0).getSeries().get(0).getValues().get(0).get(1);
  148 + } else {
  149 + System.err.print("No DB connection");
  150 + }
  151 + return null;
  152 + }
  153 +
  154 + public Double getMaxValue(String measurementName) {
  155 + if (influxDB != null) {
  156 + influxDB.setDatabase(this.database);
  157 + Query query = new Query("SELECT max(value) FROM " + measurementName, database);
  158 + QueryResult results = influxDB.query(query);
  159 + return (Double) results.getResults().get(0).getSeries().get(0).getValues().get(0).get(1);
  160 + } else {
  161 + System.err.print("No DB connection");
  162 + }
  163 + return null;
  164 + }
  165 +
  166 + public void reportOutlier(Data outlier, String label, String ruleMeas) {
  167 + influxDB.setDatabase(this.resultsDatabase);
  168 + //don't report outliers with distance lower than 5mins
  169 +// if (lastReportedMap.containsKey(label)) {
  170 +// if (Duration.between(lastReportedMap.get(label).getActualTime(), outlier.getActualTime()).toMinutes() < 5 ) {
  171 +// return;
  172 +// }
  173 +// }
  174 + if(Constants.debug || Constants.verbose){
  175 + //System.out.println(label + " " + outlier.getActualTime().toString());
  176 + }
  177 +
  178 + influxDB.setRetentionPolicy("autogen");
  179 + influxDB.write(Point.measurement(Constants.resutlsDBTable)
  180 + .time(outlier.getActualTime().toEpochMilli(), TimeUnit.MILLISECONDS)
  181 + //.addField("report_time", System.currentTimeMillis())
  182 + .addField("report_time", outlier.getOutlier_start_time())
  183 + .addField("reason", "")
  184 + .tag("sensor", ruleMeas.isEmpty()?Constants.sensors:ruleMeas)
  185 + .tag("seen", "")
  186 + .tag("submitted", "")
  187 + .tag("machine", "A-ERWAERM1-PRESSE")
  188 + .tag("label", label)
  189 + .build());
  190 + lastReportedMap.put(label, outlier);
  191 + }
  192 +
  193 + private boolean dbExists(String database){
  194 + boolean exists = false;
  195 + QueryResult query = influxDB.query(new Query("show databases",database));
  196 + for (List<Object> v : query.getResults().get(0).getSeries().get(0).getValues()){
  197 + if(v.get(0).equals(database)){
  198 + return true;
  199 + }
  200 + }
  201 + return exists;
  202 + }
  203 +}
unsupervised/src/main/java/com/atl/afaultdetection/utils/Measurement.java View file @ f19350b
... ... @@ -0,0 +1,46 @@
  1 +/*
  2 + * To change this license header, choose License Headers in Project Properties.
  3 + * To change this template file, choose Tools | Templates
  4 + * and open the template in the editor.
  5 + */
  6 +package com.atl.afaultdetection.utils;
  7 +
  8 +/**
  9 + *
  10 + * @author thanasis
  11 + */
  12 +public class Measurement {
  13 + private String measurementDBName;
  14 + private Double minValue;
  15 + private Double maxValue;
  16 +
  17 + public Measurement(String measurementDBName) {
  18 + this.measurementDBName = measurementDBName;
  19 + }
  20 +
  21 + public String getMeasurementDBName() {
  22 + return measurementDBName;
  23 + }
  24 +
  25 + public void setMeasurementDBName(String measurementDBName) {
  26 + this.measurementDBName = measurementDBName;
  27 + }
  28 +
  29 + public Double getMinValue() {
  30 + return minValue;
  31 + }
  32 +
  33 + public void setMinValue(Double minValue) {
  34 + this.minValue = minValue;
  35 + }
  36 +
  37 + public Double getMaxValue() {
  38 + return maxValue;
  39 + }
  40 +
  41 + public void setMaxValue(Double maxValue) {
  42 + this.maxValue = maxValue;
  43 + }
  44 +
  45 +
  46 +}
unsupervised/src/main/java/com/atl/afaultdetection/utils/Utils.java View file @ f19350b
... ... @@ -0,0 +1,105 @@
  1 +/*
  2 + * To change this license header, choose License Headers in Project Properties.
  3 + * To change this template file, choose Tools | Templates
  4 + * and open the template in the editor.
  5 + */
  6 +package com.atl.afaultdetection.utils;
  7 +
  8 +import com.atl.mcod.utils.Constants;
  9 +import java.time.Instant;
  10 +import java.util.Iterator;
  11 +import java.util.Map;
  12 +
  13 +/**
  14 + *
  15 + * @author thanasis
  16 + */
  17 +public class Utils {
  18 + public static double normalize(double value, double minValue, double maxValue){
  19 + return (value - minValue) / (maxValue - minValue);
  20 + }
  21 +
  22 + public static boolean allIteratorsHaveNext(Map<String,Iterator<Map.Entry<Instant,Double>>> slideDataIteratorsMap) {
  23 + for(Map.Entry<String, Iterator<Map.Entry<Instant,Double>>> entry : slideDataIteratorsMap.entrySet()){
  24 + if(!entry.getValue().hasNext()){
  25 + return false;
  26 + }
  27 + }
  28 + return true;
  29 + }
  30 +
  31 + public static String checkRuleBasedParameters(Map<String, Measurement> measMap) {
  32 + String ruleMeas = "";
  33 + if (Constants.algorithms.contains("mcod") && Constants.maxRule == null && Constants.minRule == null) {
  34 + System.err.println("No rule set for the rule based");
  35 + System.exit(5);
  36 + }
  37 + if (Constants.algorithms.contains("mcodKnowledge") && Constants.maxRule_k == null && Constants.minRule_k == null) {
  38 + System.err.println("No rule set for the rule based knowledge");
  39 + System.exit(5);
  40 + }
  41 + if (Constants.maxRule != null) {
  42 + String ruleQuickMeas = Constants.maxRule.split("-")[0];
  43 + if (!measMap.containsKey(Constants.measShortNameMap.get(ruleQuickMeas))) {
  44 + System.err.println("The measurement of the max rule should be passed using the --meas flag");