1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef KMEANS_H 18 #define KMEANS_H 19 20 #include <cstdlib> 21 #include <math.h> 22 23 // Helper functions 24 25 template <typename T, typename N> 26 inline void sum(T values[], int len, int dimension, int stride, N dst[]) { 27 int x, y; 28 // zero out dst vector 29 for (x = 0; x < dimension; x++) { 30 dst[x] = 0; 31 } 32 for (x = 0; x < len; x+= stride) { 33 for (y = 0; y < dimension; y++) { 34 dst[y] += values[x + y]; 35 } 36 } 37 } 38 39 template <typename T, typename N> 40 inline void set(T val1[], N val2[], int dimension) { 41 int x; 42 for (x = 0; x < dimension; x++) { 43 val1[x] = val2[x]; 44 } 45 } 46 47 template <typename T, typename N> 48 inline void add(T val[], N dst[], int dimension) { 49 int x; 50 for (x = 0; x < dimension; x++) { 51 dst[x] += val[x]; 52 } 53 } 54 55 template <typename T, typename N> 56 inline void divide(T dst[], N divisor, int dimension) { 57 int x; 58 if (divisor == 0) { 59 return; 60 } 61 for (x = 0; x < dimension; x++) { 62 dst[x] /= divisor; 63 } 64 } 65 66 /** 67 * Calculates euclidean distance. 68 */ 69 70 template <typename T, typename N> 71 inline N euclideanDist(T val1[], T val2[], int dimension) { 72 int x; 73 N sum = 0; 74 for (x = 0; x < dimension; x++) { 75 N diff = (N) val1[x] - (N) val2[x]; 76 sum += diff * diff; 77 } 78 return sqrt(sum); 79 } 80 81 // K-Means 82 83 84 /** 85 * Picks k random starting points from the data set. 86 */ 87 template <typename T> 88 void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[], 89 unsigned int seed) { 90 int x, z, num_vals, cntr; 91 num_vals = len / stride; 92 cntr = 0; 93 srand(seed); 94 unsigned int r_vals[k]; 95 unsigned int r; 96 97 for (x = 0; x < k; x++) { 98 99 // ensure randomly chosen value is unique 100 int r_check = 0; 101 while (r_check == 0) { 102 r = (unsigned int) rand() % num_vals; 103 r_check = 1; 104 for (z = 0; z < x; z++) { 105 if (r == r_vals[z]) { 106 r_check = 0; 107 } 108 } 109 } 110 r_vals[x] = r; 111 r *= stride; 112 113 // set dst to be randomly chosen value 114 set<T,T>(dst + cntr, values + r, dimension); 115 cntr += stride; 116 } 117 } 118 119 /** 120 * Finds index of closet centroid to a value 121 */ 122 template <typename T, typename N> 123 inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) { 124 int best_ind = 0; 125 N best_len = euclideanDist <T, N>(values, oldCenters, dimension); 126 int y; 127 for (y = stride; y < pop_size; y+=stride) { 128 N l = euclideanDist <T, N>(values, oldCenters + y, dimension); 129 if (l < best_len) { 130 best_len = l; 131 best_ind = y; 132 } 133 } 134 return best_ind; 135 } 136 137 /** 138 * Calculates new centroids by averaging value clusters for old centroids. 139 */ 140 template <typename T, typename N> 141 int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[], 142 T dst[]) { 143 int x, pop_size; 144 pop_size = k * stride; 145 int popularities[k]; 146 N tmp[pop_size]; 147 148 //zero popularities 149 memset(popularities, 0, sizeof(int) * k); 150 // zero dst, and tmp 151 for (x = 0; x < pop_size; x++) { 152 tmp[x] = 0; 153 } 154 155 // put summation for each k in tmp 156 for (x = 0; x < len; x+=stride) { 157 int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size); 158 add<T, N>(values + x, tmp + best, dimension); 159 popularities[best / stride]++; 160 161 } 162 163 int ret = 0; 164 int y; 165 // divide to get centroid and set dst to result 166 for (x = 0; x < pop_size; x+=stride) { 167 divide<N, int>(tmp + x, popularities[x / stride], dimension); 168 for (y = 0; y < dimension; y++) { 169 if ((dst + x)[y] != (T) ((tmp + x)[y])) { 170 ret = 1; 171 } 172 } 173 set(dst + x, tmp + x, dimension); 174 } 175 return ret; 176 } 177 178 template <typename T, typename N> 179 void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride, 180 int iterations, T initialPicks[]){ 181 int k_len = k * stride; 182 int x; 183 184 // zero newCenters 185 for (x = 0; x < k_len; x++) { 186 finalCentroids[x] = 0; 187 } 188 189 T * c1 = initialPicks; 190 T * c2 = finalCentroids; 191 T * temp; 192 int ret = 1; 193 for (x = 0; x < iterations; x++) { 194 ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2); 195 temp = c1; 196 c1 = c2; 197 c2 = temp; 198 if (ret == 0) { 199 x = iterations; 200 } 201 } 202 set<T, T>(finalCentroids, c1, dimension); 203 } 204 205 /** 206 * Runs the k-means algorithm on dataset values with some initial centroids. 207 */ 208 template <typename T, typename N> 209 void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride, 210 int iterations, unsigned int seed){ 211 int k_len = k * stride; 212 T initialPicks [k_len]; 213 initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed); 214 215 runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride, 216 iterations, initialPicks); 217 } 218 219 /** 220 * Sets each value in values to the closest centroid. 221 */ 222 template <typename T, typename N> 223 void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) { 224 int x, pop_size; 225 pop_size = k * stride; 226 for (x = 0; x < len; x+= stride) { 227 int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size); 228 set<T, T>(values + x, centroids + best, dimension); 229 } 230 } 231 232 #endif // KMEANS_H 233