/* PSPP - a program for statistical analysis. Copyright (C) 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid /* Struct KMeans: Holds all of the information for the functions. */ struct Kmeans { gsl_matrix *data; //User Data (Given by the user) gsl_matrix *centers; //Centers for groups gsl_vector_int *index; //Integer values from zero to ngroups. Shows group of an observation. gsl_vector *v1, *v2, *v3; //Temporary vector for program. Do not use. gsl_rng *rng; //Random Number Generator. int ngroups; //Number of group. (Given by the user) int n; //Number of observations. (Given by the user) int m; //Number of variables. (Given by the user) int maxiter; //Maximum number of iterations (Given by the user) int lastiter; //Show at which iteration it found the solution. int trials; //If not convergence, how many times has clustering done. double *weights; //Double values for handling weights for program use. gsl_matrix *initial_centers; //Initial random centers const struct variable **variables; //Variables }; /* Creates and returns a struct of Kmeans with given casereader 'cs', parsed variables 'variables', number of cases 'n', number of variables 'm', number of clusters and amount of maximum iterations. */ static struct Kmeans * kmeans_create (struct casereader *cs, const struct variable **variables, int n, int m, int ngroups, int maxiter) { int i, v; double x; struct ccase *c; struct Kmeans *kmeans = xmalloc (sizeof (struct Kmeans)); kmeans->data = gsl_matrix_alloc (n, m); kmeans->centers = gsl_matrix_alloc (ngroups, m); kmeans->ngroups = ngroups; kmeans->index = gsl_vector_int_alloc (n); kmeans->n = n; kmeans->m = m; kmeans->maxiter = maxiter; kmeans->lastiter = 0; kmeans->trials = 0; kmeans->variables = variables; i = 0; for (; (c = casereader_read (cs)) != NULL; case_unref (c)) { for (v = 0; v < m; ++v) { x = case_data (c, variables[v])->f; /* I think using k->data[i*p+v]=x is faster but for being convenient, i am using gsl_matrix_set() here. */ gsl_matrix_set (kmeans->data, i, v, x); } i++; } kmeans->weights = xmalloc (sizeof (double) * kmeans->index->size); kmeans->v1 = gsl_vector_alloc (kmeans->centers->size2); kmeans->v2 = gsl_vector_alloc (kmeans->centers->size2); kmeans->v3 = gsl_vector_alloc (kmeans->n); kmeans->rng = get_rng (); kmeans->initial_centers = NULL; return (kmeans); } /* Creates random centers using randomly selected cases from the data. */ static void kmeans_randomize_centers (struct Kmeans *kmeans) { int i, j; int randind; for (i = 0; i < kmeans->centers->size1; i++) { randind = (int) (gsl_rng_uniform (kmeans->rng) * kmeans->data->size1); for (j = 0; j < kmeans->centers->size2; j++) { gsl_matrix_set (kmeans->centers, i, j, gsl_matrix_get (kmeans->data, randind, j)); } } /* If it is the first iteration, the variable kmeans->initial_centers is NULL and it is created once for reporting issues. In SPSS, initial centers are shown in the reports but in PSPP it is not shown now. I am leaving it here. */ if (!kmeans->initial_centers) { kmeans->initial_centers = gsl_matrix_alloc (kmeans->centers->size1, kmeans->centers->size2); for (i = 0; i < kmeans->centers->size1; i++) { for (j = 0; j < kmeans->centers->size2; j++) { gsl_matrix_set (kmeans->initial_centers, i, j, gsl_matrix_get (kmeans->centers, i, j)); } } } } /* kmeans_randomize_index() was used at the older versions of quick_cluster but we are not using it now. I am leaving it here. */ /* static void kmeans_randomize_index (struct Kmeans *kmeans) { int i; for (i = 0; i < kmeans->index->size; i++) { kmeans->index->data[i] = -1; } } */ /* kmeans_print() may be used for debugging issues. */ /* static void kmeans_print (struct Kmeans *kmeans) { int i, j; printf ("Number of groups: %d\n", kmeans->ngroups); printf ("Centers:\n"); for (i = 0; i < kmeans->centers->size1; i++) { for (j = 0; j < kmeans->centers->size2; j++) { printf ("%f ", gsl_matrix_get (kmeans->centers, i, j)); } printf ("\n"); } printf ("Index:\n"); for (i = 0; i < kmeans->n; i++) { printf ("%d ", gsl_vector_int_get (kmeans->index, i)); } printf ("\nLast iter: %d\n", kmeans->lastiter); } */ /* Prints a gsl_matrix. I am leaving it here for debugging issues. */ /* static void print_matrix (gsl_matrix * m) { int i, j; for (i = 0; i < m->size1; i++) { for (j = 0; j < m->size2; j++) { printf ("%f ", m->data[i * m->size2 + j]); } printf ("\n"); } } */ /* Calculates the squared euclidean distances between two vectors. Square root is not used here for decreasing computation times. Comparing squared distances equals to comparing non-squared ones. */ static double kmeans_euclidean_distance (gsl_vector * v1, gsl_vector * v2) { double result = 0.0; double val; int i; for (i = 0; i < v1->size; i++) { val = v1->data[i] - v2->data[i]; result += val * val; } return (result); } /* Returns the actual number of cases of a cluster */ static int kmeans_num_elements_group (struct Kmeans *kmeans, int group) { int total = 0; int i; for (i = 0; i < kmeans->n; i++) { if (gsl_vector_int_get (kmeans->index, i) == group) total++; } return (total); } /* Re-calculates the cluster centers */ static void kmeans_recalculate_centers (struct Kmeans *kmeans) { int i, j, h; int elm; double mean; gsl_vector *v1 = kmeans->v3; for (i = 0; i < kmeans->ngroups; i++) { elm = kmeans_num_elements_group (kmeans, i); for (j = 0; j < kmeans->index->size; j++) { if (gsl_vector_int_get (kmeans->index, j) == i) { kmeans->weights[j] = 1.0; } else { kmeans->weights[j] = 0.0; } } for (h = 0; h < kmeans->m; h++) { gsl_matrix_get_col (v1, kmeans->data, h); mean = gsl_stats_wmean (kmeans->weights, 1, v1->data, 1, v1->size); gsl_matrix_set (kmeans->centers, i, h, mean); } } } /* The variable index in struct Kmeans holds integer values that represents the current groups of cases. index[n]=a shows the nth case is belong to ath cluster. */ static void kmeans_calculate_indexes (struct Kmeans *kmeans) { double dist; double mindist; int bestindex = 0; int i, j; gsl_vector *v1 = kmeans->v1; gsl_vector *v2 = kmeans->v2; for (i = 0; i < kmeans->n; i++) { mindist = INFINITY; gsl_matrix_get_row (v1, kmeans->data, i); for (j = 0; j < kmeans->ngroups; j++) { gsl_matrix_get_row (v2, kmeans->centers, j); dist = kmeans_euclidean_distance (v1, v2); if (dist < mindist) { mindist = dist; bestindex = j; } } gsl_vector_int_set (kmeans->index, i, bestindex); } } /* Returns the number of different cases of index variables. If last two index variables are equal, there is no any enhancement of clustering. */ static int kmeans_check_converge (gsl_vector_int * current, gsl_vector_int * old) { int i; int total = 0; for (i = 0; i < current->size; i++) { if (current->data[i] == old->data[i]) total++; old->data[i] = current->data[i]; } return (current->size - total); } /* static gsl_matrix * kmeans_get_group (struct Kmeans *kmeans, int index) { int i; int j = 0; int elm = kmeans_num_elements_group (kmeans, index); gsl_matrix *agroup = gsl_matrix_alloc (elm, kmeans->data->size2); gsl_vector *v1 = gsl_vector_alloc (kmeans->data->size2); for (i = 0; i < kmeans->data->size1; i++) { if (kmeans->index->data[i] == index) { gsl_matrix_get_row (v1, kmeans->data, i); gsl_matrix_set_row (agroup, j, v1); j++; } } return (agroup); } */ /* Main algorithm. Does iterations, checks convergency */ static void kmeans_cluster (struct Kmeans *kmeans) { int i; bool redo; gsl_vector_int *oldindex = gsl_vector_int_alloc (kmeans->index->size); cluster: redo = false; kmeans_randomize_centers (kmeans); for (kmeans->lastiter = 0; kmeans->lastiter < kmeans->maxiter; kmeans->lastiter++) { kmeans_calculate_indexes (kmeans); kmeans_recalculate_centers (kmeans); if (kmeans_check_converge (kmeans->index, oldindex) == 0) break; } for (i = 0; i < kmeans->ngroups; i++) { if (kmeans_num_elements_group (kmeans, i) == 0) { kmeans->trials++; redo = true; break; } } if (redo) goto cluster; } /* Reports centers of clusters. initial parameter is optional for future use. if initial is true, initial cluster centers are reported. Otherwise, resulted centers are reported. */ static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial) { struct tab_table *t; int nc, nr, heading_columns, currow; int i, j; nc = kmeans->ngroups + 1; nr = kmeans->data->size2 + 4; heading_columns = 1; t = tab_create (nc, nr); tab_headers (t, 0, nc - 1, 0, 1); currow = 0; if (!initial) { tab_title (t, _("Final Cluster Centers")); } else { tab_title (t, _("Initial Cluster Centers")); } tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1); tab_joint_text (t, 1, 0, nc - 1, 0, TAB_CENTER, _("Cluster")); tab_hline (t, TAL_1, 1, nc - 1, 2); currow += 2; for (i = 0; i < kmeans->ngroups; i++) { tab_text_format (t, (i + 1), currow, TAB_CENTER, "%d", (i + 1)); } currow++; tab_hline (t, TAL_1, 1, nc - 1, currow); currow++; for (i = 0; i < kmeans->data->size2; i++) { tab_text (t, 0, currow + i, TAB_LEFT, var_to_string (kmeans->variables[i])); } for (i = 0; i < kmeans->centers->size1; i++) { for (j = 0; j < kmeans->centers->size2; j++) { if (!initial) { tab_double (t, i + 1, j + 4, TAB_CENTER, gsl_matrix_get (kmeans->centers, i, j), var_get_print_format (kmeans->variables[j])); } else { tab_double (t, i + 1, j + 4, TAB_CENTER, gsl_matrix_get (kmeans->initial_centers, i, j), var_get_print_format (kmeans->variables[j])); } } } tab_submit (t); } /* Reports number of cases of each single cluster. */ static void quick_cluster_show_number_cases (struct Kmeans *kmeans) { struct tab_table *t; int nc, nr; int i, numelem; long int total; nc = 3; nr = kmeans->ngroups + 1; t = tab_create (nc, nr); tab_headers (t, 0, nc - 1, 0, 0); tab_title (t, _("Number of Cases in each Cluster")); tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1); tab_text (t, 0, 0, TAB_LEFT, _("Cluster")); total = 0; for (i = 0; i < kmeans->ngroups; i++) { tab_text_format (t, 1, i, TAB_CENTER, "%d", (i + 1)); numelem = kmeans_num_elements_group (kmeans, i); tab_text_format (t, 2, i, TAB_CENTER, "%d", numelem); total += numelem; } tab_text (t, 0, kmeans->ngroups, TAB_LEFT, _("Valid")); tab_text_format (t, 2, kmeans->ngroups, TAB_LEFT, "%ld", total); tab_submit (t); } /* Reports */ static void quick_cluster_show_results (struct Kmeans *kmeans) { //uncomment the line above for reporting initial centers //quick_cluster_show_centers (kmeans, true); quick_cluster_show_centers (kmeans, false); quick_cluster_show_number_cases (kmeans); } int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) { struct Kmeans *kmeans; bool ok; struct dictionary *dict = dataset_dict (ds); const struct variable **variables; struct casereader *cs; int groups = 2; int numobs = 0; int maxiter = 2; size_t p; if (!parse_variables_const (lexer, dict, &variables, &p, PV_NO_DUPLICATE | PV_NUMERIC)) { msg (ME, _("Variables cannot be parsed")); return (CMD_FAILURE); } if (lex_match (lexer, T_SLASH)) { if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "CLUSTERS")) { if (lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); groups = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } } else if (lex_match_id (lexer, "MXITER")) { if (lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); maxiter = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } } else { //further command set return (CMD_FAILURE); } } } } cs = proc_open (ds); numobs = casereader_count_cases (cs); if (groups > numobs) { printf (_("Number of groups cannot be larger than the number of cases.\n")); ok = casereader_destroy (cs); proc_commit (ds); return (CMD_FAILURE); } kmeans = kmeans_create (cs, variables, numobs, p, groups, maxiter); ok = casereader_destroy (cs); ok = proc_commit (ds) && ok; kmeans_cluster (kmeans); quick_cluster_show_results (kmeans); return (ok); }