#Author: Naskos Athanasios
import numpy as np
import matplotlib.pyplot as plt
import csv, random, sys
import pandas as pd
# ---------------------------------------------------------------------------
# Default configuration. Most of these values can be overridden from the
# command line using a "--<name> <value>" pair.
# ---------------------------------------------------------------------------

# Reproducibility and output naming.
seed = 4                 # seed of the random generator used by the script
ds_type = "training"     # prefix of the output file currently being written

# Size of the event vocabulary and of the simulated time span (in days).
n_fault_type = 1500      # number of regular event ids; the target id is n_fault_type + 1
training_days = 365 * 3
testing_days = 365 * 2

# Maximum gap (in days) between two consecutive occurrences of one event id;
# the generator doubles it for the target event.
max_dist_between_events = 50

# Shape of the pattern that is planted before each target event.
pattern_length = 6                     # number of pattern events per target event
min_pattern_forms = 1                  # min interchangeable ids per pattern position
max_pattern_forms = 4                  # max interchangeable ids per pattern position
shuffled_pattern = False               # True -> pattern positions are placed in random order

# Timing of the pattern (all in days).
min_n_days_before_target = 4           # gap between the target and the closest pattern event
max_n_days_before_target = 4
min_n_days_between_pattern_events = 1  # gap between consecutive pattern events
max_n_days_between_pattern_events = 3

# Noise control.
pattern_clarity = 0.9                  # (1 - clarity) drives how many targets get a partial
                                       # pattern and how many targets are removed
partial_pattern_percentage = 0.5       # chance of keeping each event of a partial pattern
def partial_pattern_clarity_choice(probability, rng=None):
    """Return True with the given probability.

    probability -- chance (0.0 - 1.0) that the call returns True.
    rng         -- optional object exposing ``rand()`` (for example a
                   ``numpy.random.RandomState``). When omitted, the
                   module-level generator ``r`` is used, which must already
                   exist at call time.
    """
    if rng is None:
        # Default to the script-wide generator so the draw stays part of the
        # single seeded random stream.
        rng = r
    return rng.rand() < probability
def parse_bool_flag(text):
    """Convert a command-line token to a boolean.

    ``bool("False")`` is True because the string is non-empty, so the usual
    textual spellings of "false" are recognised explicitly. Every other
    non-empty value keeps meaning True.
    """
    return str(text).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


# Option name (without the leading "--") -> converter applied to its value.
CLI_OPTION_PARSERS = {
    "n_fault_type": int,
    "training_days": int,
    "testing_days": int,
    "min_n_days_before_target": int,
    "max_n_days_before_target": int,
    "min_n_days_between_pattern_events": int,
    "max_n_days_between_pattern_events": int,
    "max_dist_between_events": int,
    "partial_pattern_percentage": float,
    "pattern_length": int,
    "pattern_clarity": float,
    "min_pattern_forms": int,
    "max_pattern_forms": int,
    "shuffled_pattern": parse_bool_flag,
    "seed": int,
}


def parse_cli_overrides(argv):
    """Parse "--name value" pairs into a {name: converted value} dict.

    argv -- list of argument strings WITHOUT the program name.

    Unknown tokens are skipped. A recognised option that is the last token
    (no value after it) raises ValueError naming the option; a value that
    cannot be converted raises the converter's own ValueError.
    """
    overrides = {}
    position = 0
    while position < len(argv):
        token = argv[position]
        option = token[2:] if token.startswith("--") else None
        if option in CLI_OPTION_PARSERS:
            if position + 1 >= len(argv):
                raise ValueError("missing value for option " + token)
            overrides[option] = CLI_OPTION_PARSERS[option](argv[position + 1])
            # Skip the value that was just consumed.
            position += 2
        else:
            position += 1
    return overrides


# The option names equal the names of the module-level settings, so the parsed
# values simply replace the defaults; settings that were not passed are left
# untouched.
globals().update(parse_cli_overrides(sys.argv[1:]))
# Total simulated horizon: the training period followed by the testing period.
n_days = training_days + testing_days

# One seeded generator feeds every random decision of the script.
r = np.random.RandomState(seed)

# The target event gets the id right after the last regular event id.
target_event = n_fault_type + 1

# Build the pattern forms.
# pattern_dict maps the primary id of each pattern position to the list of
# interchangeable ids ("forms") for that position. The primary id always comes
# first; additional forms take the ids just below the block of primary ids,
# handed out one by one through pattern_num.
pattern_dict = {}
pattern_num = 1
for offset in range(1, pattern_length + 1):
    pattern_event = target_event - offset
    if max_pattern_forms == 1:
        # Exactly one form: no random draw is made in this case.
        forms = 1
    else:
        forms = r.randint(min_pattern_forms, max_pattern_forms + 1)
    alternatives = [pattern_event]
    for _ in range(forms - 1):
        alternatives.append(target_event - pattern_length - pattern_num)
        pattern_num += 1
    pattern_dict[pattern_event] = alternatives
# Create the output file.
# Every generation parameter is encoded in the file name so that a data set
# can be traced back to the settings that produced it.
name = (
    "{ft}ft_{vl}vl_{days}d_{pc}pc_{ppc}ppc_"
    "{minbt}minbt_{maxbt}maxbt_{minbpe}minbpe_{maxbpe}maxbpe_"
    "{pl}pl_{minpf}minpf_{maxpf}maxpf_{shuffled}ShuffledP_{seed}seed"
).format(
    ft=n_fault_type,
    vl=target_event,
    days=n_days,
    pc=int(pattern_clarity * 100),
    ppc=int(partial_pattern_percentage * 100),
    minbt=min_n_days_before_target,
    maxbt=max_n_days_before_target,
    minbpe=min_n_days_between_pattern_events,
    maxbpe=max_n_days_between_pattern_events,
    pl=pattern_length,
    minpf=min_pattern_forms,
    maxpf=max_pattern_forms,
    shuffled=shuffled_pattern,
    seed=seed,
)
ofile = open("{}_dataset_{}.csv".format(ds_type, name), "w")
writer = csv.writer(ofile)

# Header rows start with "#" and describe the generated data. Commas are
# stripped from the dict dump before it is written.
header_rows = [
    "#target event --> " + str(target_event),
    "#min different pattern types --> " + str(min_pattern_forms),
    "#max different pattern types --> " + str(max_pattern_forms),
    "#pattern forms --> " + str(pattern_dict).replace(",", ""),
]
for header_row in header_rows:
    writer.writerow([header_row])
# events maps a day index to the list of event ids that occur on that day;
# target_event_days collects the day index of every target event occurrence.
# The target event is added to the first and last day in order to use all the
# generated data.
events = {0: [target_event], n_days - 1: [target_event]}
target_event_days = [0, n_days - 1]

for event in range(1, target_event + 1):
    is_target = (event == target_event)

    # Select a random shape for the weibull dist and draw 100 points.
    # This draw is made for the target event as well (and then discarded) so
    # that the sequence of random numbers consumed stays the same.
    shape = r.uniform(0, 20)
    dist = r.weibull(shape, 100).tolist()
    if is_target:
        # The target event always follows the same weibull dist (shape 50);
        # like every other event it is built from 100 points.
        shape = 50
        dist = r.weibull(shape, 100).tolist()

    # Samples are min-max normalized to [0, mdbe] so that the maximum distance
    # between two occurrences equals mdbe (twice as large for the target).
    mdbe = 2 * max_dist_between_events if is_target else max_dist_between_events
    minv = min(dist)
    maxv = max(dist)

    sum_days = 0
    for sample in dist:
        gap = np.ceil(mdbe * ((sample - minv) / (maxv - minv))).astype('int')
        # Clamp the gap in case maxv or minv == Inf produced an out-of-range
        # value.
        if gap > mdbe:
            gap = mdbe
        if gap < 0:
            gap = 0
        # Every point of the dist is a gap in days; the running total is the
        # day on which the event occurs.
        sum_days += gap
        if sum_days >= n_days:
            break
        # The same event may be recorded more than once on one day (gap == 0).
        events.setdefault(sum_days, []).append(event)
        if is_target:
            target_event_days.append(sum_days)

# Defensive cleanup: days are only recorded while sum_days < n_days, so no key
# is expected here. Keys are selected by value so the deletion cannot raise
# KeyError.
for spare_day in [day for day in events if day >= n_days]:
    del events[spare_day]
# Compute mean events per day (over the days that hold at least one event).
# float() keeps the fractional part under Python 2 integer division.
total_events = sum(len(day_events) for day_events in events.values())
writer.writerow(["#mean events per day --> " + str(float(total_events) / len(events))])

# Sort first: the list starts as [0, n_days-1, ...], so consecutive differences
# are only real gaps between neighbouring target events once it is ordered.
target_event_days = sorted(target_event_days)

# Compute the mean gap (in days) between consecutive target events.
total_gap = 0
for earlier_day, later_day in zip(target_event_days[:-1], target_event_days[1:]):
    total_gap += later_day - earlier_day
mean_gap = float(total_gap) / (len(target_event_days) - 1)
writer.writerow(["#mean frequency of target event --> " + str(mean_gap)])

writer.writerow(["#days with target event --> " + str(target_event_days).replace(",","")])
# Single-argument call form prints the same under Python 2 and Python 3.
print(len(target_event_days))
# Select the target events which will be preceded by partial patterns based on
# the specified clarity.
n_partial_patterns = int(round(len(target_event_days)*(1-pattern_clarity)))
partial_candidates = [day for day in target_event_days if (day != 0 and day <= training_days)]
# Sampling without replacement cannot return more days than there are
# candidates; cap the request instead of letting r.choice raise ValueError.
n_partial_patterns = min(n_partial_patterns, len(partial_candidates))
selected_partial_days = r.choice(partial_candidates, n_partial_patterns, replace=False)
writer.writerow(["#days with partial pattern --> " + str(selected_partial_days)])
writer.writerow(["#shuffled pattern events --> " + str(shuffled_pattern)])

# Set copy for constant-time membership tests inside the loop below.
partial_day_lookup = set(np.asarray(selected_partial_days).tolist())

# Add the pattern before the target events, walking from the last target event
# back to the second one (index 0 has no previous target event).
for i in range(len(target_event_days)-1, 0, -1):
    target_event_day = target_event_days[i]
    # Day of the target event that precedes the current one.
    prev_target_event_day = target_event_days[i-1]
    writer.writerow(["#target_event_day --> " + str(target_event_day)])

    # Day before the target event where placing starts (descending direction).
    placement_day = target_event_day - r.randint(min_n_days_before_target, max_n_days_before_target+1)

    # Order in which the pattern positions are placed; optionally shuffled.
    if shuffled_pattern:
        pattern_range = r.choice(range(1, pattern_length+1), pattern_length, replace=False)
    else:
        pattern_range = range(1, pattern_length+1)

    is_partial = target_event_day in partial_day_lookup

    # NOTE(review): the counter is assumed to advance once per pattern position
    # whether or not the event was actually placed - confirm this is intended.
    for placed_pattern_events, p in enumerate(pattern_range):
        # Select the pattern event and pick one of its forms.
        pattern_event = r.choice(pattern_dict[target_event - p], 1)[0]

        # Every event after the first moves further back in time.
        if placed_pattern_events > 0:
            placement_day -= r.randint(min_n_days_between_pattern_events, max_n_days_between_pattern_events+1)

        # Place the event only if it falls after the previous target event day.
        if placement_day <= prev_target_event_day:
            continue

        # For partial patterns each event is kept only with the configured
        # probability; the random draw happens only for partial-pattern days.
        if is_partial and not partial_pattern_clarity_choice(partial_pattern_percentage):
            continue

        events.setdefault(placement_day, []).append(pattern_event)
        writer.writerow(["#" + str(pattern_event) + " --> " + str(placement_day)])
# Remove target events (from non partial pattern cases) to meet the specified
# clarity.
n_removed_events = int(round(len(target_event_days)*(1-pattern_clarity)))

# Set copy so each candidate is checked in constant time.
partial_day_lookup = set(np.asarray(selected_partial_days).tolist())

# The target events are removed only from the training set.
# NOTE(review): this filter uses "day < training_days" while the partial
# pattern selection uses "day <= training_days" - confirm which bound is meant.
removal_candidates = [
    day for day in target_event_days
    if day not in partial_day_lookup and day != 0 and day < training_days
]

# Sampling without replacement cannot return more days than there are
# candidates; cap the request instead of letting r.choice raise ValueError.
n_removed_events = min(n_removed_events, len(removal_candidates))
selected_days = r.choice(removal_candidates, n_removed_events, replace=False)

for d in selected_days:
    events[d].remove(target_event)
    target_event_days.remove(d)
writer.writerow(["#days with removed target event --> " + str(selected_days)])
# Write the events to the appropriate training and testing files.
writer.writerow(["Timestamps","Event_id"])
# Day index -> calendar date, starting on 1 January 2014.
datelist = pd.date_range(start='1/1/2014', periods=n_days+1).tolist()

writing_testing_file = False
try:
    # Iterate in chronological order; a dict gives no such guarantee by itself.
    for day in sorted(events):
        # Days 0..training_days go to the training file and every later day to
        # the testing file. The switch is driven by the day index, so days
        # without events cannot shift the boundary.
        if not writing_testing_file and day > training_days:
            ofile.close()
            ds_type = "testing"
            ofile = open(ds_type+"_dataset_"+name+".csv", "w")
            writer = csv.writer(ofile)
            writer.writerow(["Timestamps","Event_id"])
            writing_testing_file = True

        date_label = str(datelist[day]).replace(" 00:00:00","")
        for event in events[day]:
            # An id that is the primary id of a pattern position is replaced by
            # a randomly chosen form of that position.
            if event in pattern_dict:
                event = r.choice(pattern_dict[event],1)[0]
            writer.writerow([date_label, event])
finally:
    # Close whichever file is open, also when writing fails part-way.
    ofile.close()