Home | History | Annotate | Download | only in filters
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef KMEANS_H
     18 #define KMEANS_H
     19 
     20 #include <cstdlib>
     21 #include <math.h>
     22 
     23 // Helper functions
     24 
     25 template <typename T, typename N>
     26 inline void sum(T values[], int len, int dimension, int stride, N dst[]) {
     27     int x, y;
     28     // zero out dst vector
     29     for (x = 0; x < dimension; x++) {
     30         dst[x] = 0;
     31     }
     32     for (x = 0; x < len; x+= stride) {
     33         for (y = 0; y < dimension; y++) {
     34             dst[y] += values[x + y];
     35         }
     36     }
     37 }
     38 
     39 template <typename T, typename N>
     40 inline void set(T val1[], N val2[], int dimension) {
     41     int x;
     42     for (x = 0; x < dimension; x++) {
     43         val1[x] = val2[x];
     44     }
     45 }
     46 
     47 template <typename T, typename N>
     48 inline void add(T val[], N dst[], int dimension) {
     49     int x;
     50     for (x = 0; x < dimension; x++) {
     51         dst[x] += val[x];
     52     }
     53 }
     54 
     55 template <typename T, typename N>
     56 inline void divide(T dst[], N divisor, int dimension) {
     57    int x;
     58    if (divisor == 0) {
     59        return;
     60    }
     61    for (x = 0; x < dimension; x++) {
     62        dst[x] /= divisor;
     63    }
     64 }
     65 
     66 /**
     67  * Calculates euclidean distance.
     68  */
     69 
     70 template <typename T, typename N>
     71 inline N euclideanDist(T val1[], T val2[], int dimension) {
     72     int x;
     73     N sum = 0;
     74     for (x = 0; x < dimension; x++) {
     75         N diff = (N) val1[x] - (N) val2[x];
     76         sum += diff * diff;
     77     }
     78     return sqrt(sum);
     79 }
     80 
     81 // K-Means
     82 
     83 
     84 /**
     85  * Picks k random starting points from the data set.
     86  */
     87 template <typename T>
     88 void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[],
     89         unsigned int seed) {
     90     int x, z, num_vals, cntr;
     91     num_vals = len / stride;
     92     cntr = 0;
     93     srand(seed);
     94     unsigned int r_vals[k];
     95     unsigned int r;
     96 
     97     for (x = 0; x < k; x++) {
     98 
     99         // ensure randomly chosen value is unique
    100         int r_check = 0;
    101         while (r_check == 0) {
    102             r = (unsigned int) rand() % num_vals;
    103             r_check = 1;
    104             for (z = 0; z < x; z++) {
    105                 if (r == r_vals[z]) {
    106                     r_check = 0;
    107                 }
    108             }
    109         }
    110         r_vals[x] = r;
    111         r *= stride;
    112 
    113         // set dst to be randomly chosen value
    114         set<T,T>(dst + cntr, values + r, dimension);
    115         cntr += stride;
    116     }
    117 }
    118 
    119 /**
    120  * Finds index of closet centroid to a value
    121  */
    122 template <typename T, typename N>
    123 inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) {
    124     int best_ind = 0;
    125     N best_len = euclideanDist <T, N>(values, oldCenters, dimension);
    126     int y;
    127     for (y = stride; y < pop_size; y+=stride) {
    128         N l = euclideanDist <T, N>(values, oldCenters + y, dimension);
    129         if (l < best_len) {
    130             best_len = l;
    131             best_ind = y;
    132         }
    133     }
    134     return best_ind;
    135 }
    136 
    137 /**
    138  * Calculates new centroids by averaging value clusters for old centroids.
    139  */
    140 template <typename T, typename N>
    141 int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[],
    142         T dst[]) {
    143     int x, pop_size;
    144     pop_size = k * stride;
    145     int popularities[k];
    146     N tmp[pop_size];
    147 
    148     //zero popularities
    149     memset(popularities, 0, sizeof(int) * k);
    150     // zero dst, and tmp
    151     for (x = 0; x < pop_size; x++) {
    152         tmp[x] = 0;
    153     }
    154 
    155     // put summation for each k in tmp
    156     for (x = 0; x < len; x+=stride) {
    157         int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size);
    158         add<T, N>(values + x, tmp + best, dimension);
    159         popularities[best / stride]++;
    160 
    161     }
    162 
    163     int ret = 0;
    164     int y;
    165     // divide to get centroid and set dst to result
    166     for (x = 0; x < pop_size; x+=stride) {
    167         divide<N, int>(tmp + x, popularities[x / stride], dimension);
    168         for (y = 0; y < dimension; y++) {
    169             if ((dst + x)[y] != (T) ((tmp + x)[y])) {
    170                 ret = 1;
    171             }
    172         }
    173         set(dst + x, tmp + x, dimension);
    174     }
    175     return ret;
    176 }
    177 
    178 template <typename T, typename N>
    179 void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
    180         int iterations, T initialPicks[]){
    181         int k_len = k * stride;
    182         int x;
    183 
    184         // zero newCenters
    185         for (x = 0; x < k_len; x++) {
    186             finalCentroids[x] = 0;
    187         }
    188 
    189         T * c1 = initialPicks;
    190         T * c2 = finalCentroids;
    191         T * temp;
    192         int ret = 1;
    193         for (x = 0; x < iterations; x++) {
    194             ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2);
    195             temp = c1;
    196             c1 = c2;
    197             c2 = temp;
    198             if (ret == 0) {
    199                 x = iterations;
    200             }
    201         }
    202         set<T, T>(finalCentroids, c1, dimension);
    203 }
    204 
    205 /**
    206  * Runs the k-means algorithm on dataset values with some initial centroids.
    207  */
    208 template <typename T, typename N>
    209 void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
    210         int iterations, unsigned int seed){
    211     int k_len = k * stride;
    212     T initialPicks [k_len];
    213     initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed);
    214 
    215     runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride,
    216         iterations, initialPicks);
    217 }
    218 
    219 /**
    220  * Sets each value in values to the closest centroid.
    221  */
    222 template <typename T, typename N>
    223 void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) {
    224     int x, pop_size;
    225     pop_size = k * stride;
    226     for (x = 0; x < len; x+= stride) {
    227         int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size);
    228         set<T, T>(values + x, centroids + best, dimension);
    229     }
    230 }
    231 
    232 #endif // KMEANS_H
    233