Bootstrapping

advertisement

Pseudocode for bootstrapping

Input: flat file of size MxN, containing “real” data. Assume that M is not too large.

Output: user-specified number of files of size MxN which contain “fake” data based on the real input data.

High-level pseudocode:

If the input data is numeric, generate output data based on the mean and stdev of the input data; else generate output data based on the similarity measure.

Pseudocode draft 1:

For each column:
    If the input data is numeric,
        //generate output data based on the mean and stdev of the input data:
        calculate mean and stdev
        classify input data based on mean and stdev
        calculate probabilities of input data classification for each class
        generate fake data
    else
        //generate output data based on the similarity measure
        find out which data values are present
        calculate probabilities of input data classification for each class
        generate fake data

Pseudocode draft 2:

Assume that the input data is called s[i][j], i=1, …,M, j = 1, .., N.

Assume that the fake data are kept in file fakedata[i][j][filenumber], i = 1, …, M, j = 1, …, N, filenumber = 1, …, Z, and Z is assigned by the user.

Assume numBounds = 4 (i.e. there are 4 bounds and thus 3 bins: min, mean-stdev, mean+stdev, max)

for (file = 1, Z) { // generate Z fake data files of size MxN
    for j = 1, N { //go through each column
        if numeric(j),
            calculate_mean_and_stdev(j, mean, stdev)
            classify_numeric_data(j, bounds[])
            calculate_numeric_probabilities (j, mean, stdev, bounds[], prob[])
            generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file])
        else
            find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals)
            generate_fake_nonnumeric_data(j, values[], prob[], numPosVals, file, M, N, fakedata[][][file])
    }
}

//calculate mean and stdev for column j
calculate_mean_and_stdev(j, mean, stdev) {
    mean =0
    stdev = 0
    max = +2^30
    min = -2^30
    for i = 1, M { //go through each row
        mean = mean + …
        max = ….
        min = …
    }
    mean = mean/M    // or is it N? think about it
    for i = 1, M { //go through each row
        stdev = stdev + …
    }
    stdev = sqrt( …)
}

// classify input data based on mean and standard deviation such that:

// the first class is from min[j] to mean[j] – stdev[j];

// the second class is from mean[j] – stdev [j] to mean[j] + stdev [j];

// the third class is from mean[j] + stdev [j] to max[j];

classify_numeric_data(j, bounds) {
    bounds[0] = min[j]
    bounds[1] = mean[j]-stdev[j]
    bounds[2] = mean[j] + stdev[j]
    bounds[3] = max[j]
}

//calculate probabilities of input data classification for each class

calculate_numeric_probabilities (j, mean, stdev, bounds[], prob[]) {
    int bin[2];
    for (i = 0, i <= 2, i++) {
        bin[i] =0;
    }
    bool found = false;
    for i = 1, M { //go through each row
        while(!found) {
            for (i = 0; i <=2; i++) {
                if (s[i][j] >= bound[i] && s[i][j] <= bound[i+1] ) {
                    bin[i]++
                    found = true;
                }
            }
        }
    }
    for (i = 0; i <=2; i++) {
        prob[i] = bin[i]/M;
    }
}

//j: column number

// bounds[]: the array with bounds between the bins (see classify_numeric_data

//for definitions)

//numPosVals: size of values

//file: filenumber

//M: number of input samples

//N: number of columns

//fakedata: file with fake data

generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file]) {
    for (i = 1, i <= M, i++) {
        r = rand()
        if r == 0    //take care of the borderline cases
            fakedata[i][j][file] = min[j]    //i.e. bound[0]
        if r == 1
            fakedata[i][j][file] = max[j]    //i.e. bound[3]
        //find which bin this value belongs to
        beg = 0;
        end = prob[0];
        k=0;
        bool found = false;
        while (!found && k <= numBounds) {
            if (beg < r <= end) {
                //generate fakedata[i][j] value in between
                //bound[k] and bound[k+1]
                r = rand()
                fakedata[i][j][file] = (bound[k+1] – bound[k])*r + bound[k]
                found = true;
            } else {
                beg = end;
                end = end + prob[k+1];
                k++;
            }
        }
        //OR: findValueSlot(r, 0, prob[0], fakedata[][j][file], bound[])
    }

}

findValueSlot(beg, end, fakedata[][][], bound[]) {
    for (k = 0, k <= numBounds - 1, k++) {
        if (beg < r <= end) {
            //generate fakedata[i][j] value in between
            //bound[k] and bound[k+1]
            r = rand()
            fakedata[i][j][file] = (bound[k+1] – bound[k])*r + bound[k]
            return
        }
        beg = end
        end = end + prob[k+1]
    }
}

//Assume that the input file has a small number of possible values for this feature.

//j: column number

// vals[]: the array with possible values; vals[i] is the ith value

// prob[]: probability of each possible values; ; prob[i] = probability of ith value

//numPosVals: size of vals[]

find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals) {
    vals[] = NIL;
    k= 0;
    for (i = 1; i<=M; i++) {
        if ((z= Search(s[i][j], vals[], 0, numPosVals-1)) == NIL) {
            //value not in vals[]
            vals[k] = s[i][j]
            prob[k]++
            k++
        } else {
            prob[z]++
        }
    }
    for (i=0, k-1) {
        prob[i]=prob[i]/M
    }
    numPosVals = k
}

//j: column number

// values[]: the array with possible values; values[i] is the ith value

//prob[]: array with probability of each value; prob[i] = probability of ith value

//numPosVals: size of values

//file: filenumber

//M: number of input samples

//N: number of columns

//fakedata: file with fake data

generate_fake_nonnumeric_data(j, values[], prob[], numPosVals, file, M, N, fakedata[][][file]) {
    for (i = 1, M) {
        r = rand()
        //take care of the beginning and end value of r
        if r == 0
            fakedata[i][j][file] = vals[0]
        if r == 1
            fakedata[i][j][file] = vals[numPosVals-1]
        beg = 0
        end = prob[numPosVals-1]
        for (k = 0, k <= numPosVals-1) {
            if (beg <= r < end ) {
                fakedata[i][j][file] = vals[k]
                break
            }
            beg = end
            end = end + prob[k+1]
        }
    }
}

// search function that looks for value v in array A[]

// first: the index of the element from which to begin searching

// last: index of the element from which to stop

// returns NIL if the value is not found, or the index if found

Search(v, A[], first, last) {
    while (first <= last) {
        mid = floor((first+last)/2)
        if v == A[mid] return mid
        if v > A[mid] first = mid+1
        if v < A[mid] last = mid-1
    }
    return NIL
}

Download