/* PSPP - a program for statistical analysis. Copyright (C) 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid struct quick_cluster { const char **varNames; int numGroups; }; /* Struct KMeans: Holds all of the information for the functions. */ struct Kmeans { gsl_matrix *data; //User Data (Given by the user) gsl_matrix *centers; //Centers for groups gsl_vector_int *index; //Integer values from zero to ngroups. Shows group of an observation. gsl_vector *v1,*v2,*v3; //Temporary vector for program. Do not use. int ngroups; //Number of group. (Given by the user) int n; //Number of observations. (Given by the user) int m; //Number of observations. (Given by the user) int maxiter; //Maximum number of iterations (Given by the user) int lastiter; //Show at which iteration it found the solution. int trials; //If not convergence, how many times has clustering done. double *weights; //Double values for handling weights for program use. }; struct Kmeans* kmeans_create(double* data, int n, int m, int ngroups, int maxiter){ int i,j; struct Kmeans *k = (struct Kmeans*) malloc(sizeof(struct Kmeans)); k->data=gsl_matrix_alloc(n,m); k->centers=gsl_matrix_alloc(ngroups, m); k->ngroups=ngroups; k->index=gsl_vector_int_alloc(n); k->n=n; k->m=m; k->maxiter=maxiter; k->lastiter=0; k->trials=0; for (i=0;idata, i, j, data[i * m +j]); } } k->weights = (double*)malloc(sizeof(double) * k->index->size); k->v1 = gsl_vector_alloc(k->centers->size2); k->v2 = gsl_vector_alloc(k->centers->size2); k->v3 = gsl_vector_alloc(k->n); return(k); } void kmeans_randomize_centers(struct Kmeans *kmeans){ int i,j; int randind; double min=0,max=0; min=gsl_matrix_min(kmeans->data); max=gsl_matrix_max(kmeans->data); for (i=0;icenters->size1;i++){ randind=(int)((((double)rand())/RAND_MAX)*kmeans->data->size1); for (j=0;jcenters->size2; j++){ //gsl_matrix_set(kmeans->centers, i, j, min + (((double)rand())/RAND_MAX)*(max-min)); gsl_matrix_set(kmeans->centers,i,j, gsl_matrix_get(kmeans->data, randind,j)); } } } void kmeans_randomize_index(struct Kmeans *kmeans){ int i; for (i=0;iindex->size;i++){ kmeans->index->data[i]=-1; } } void kmeans_print(struct Kmeans* kmeans){ int i,j; printf("Number of groups: %d\n",kmeans->ngroups); printf("Centers:\n"); for (i=0;icenters->size1;i++) { for (j=0;jcenters->size2;j++){ printf("%f ",gsl_matrix_get(kmeans->centers, i,j)); } printf("\n"); } printf("Index:\n"); for (i=0;in;i++){ printf("%d ",gsl_vector_int_get(kmeans->index, i)); } printf("\nLast iter: %d\n",kmeans->lastiter); } void print_matrix(gsl_matrix *m){ int i,j; for (i=0;isize1;i++){ for (j=0;jsize2;j++){ printf("%f ",m->data[i * m->size2 + j]); } printf("\n"); } } double kmeans_euclidean_distance(gsl_vector *v1, gsl_vector *v2){ double result=0.0; double val; int i; for (i=0;isize;i++){ val=v1->data[i] - v2->data[i]; result+=val*val; } return(result); } int kmeans_num_elements_group(struct Kmeans *kmeans, int group){ int total=0; int i; for (i=0;in;i++){ if(gsl_vector_int_get(kmeans->index,i)==group) total++; } return(total); } void kmeans_recalculate_centers(struct Kmeans *kmeans){ int i,j,h; int elm; double mean; gsl_vector *v1=kmeans->v3; for (i=0;ingroups;i++){ elm=kmeans_num_elements_group(kmeans,i); for (j=0;jindex->size;j++){ if(gsl_vector_int_get(kmeans->index,j)==i){ kmeans->weights[j]=1.0; }else{ kmeans->weights[j]=0.0; } } for (h=0;hm;h++){ gsl_matrix_get_col(v1,kmeans->data, h); mean=gsl_stats_wmean(kmeans->weights, 1, v1->data ,1, v1->size); gsl_matrix_set(kmeans->centers, i,h, mean); } } } void kmeans_calculate_indexes(struct Kmeans *kmeans){ double dist; double mindist; int bestindex=0; int i,j; gsl_vector *v1 = kmeans->v1; gsl_vector *v2 = kmeans->v2; for (i=0;in; i++){ mindist=INFINITY; gsl_matrix_get_row(v1, kmeans->data, i); for (j=0;jngroups; j++){ gsl_matrix_get_row(v2, kmeans->centers,j); dist=kmeans_euclidean_distance(v1,v2); if(distindex,i,bestindex); } } int kmeans_check_converge(gsl_vector_int *current, gsl_vector_int *old, struct Kmeans *kmeans){ int i; int total=0; for (i=0;isize;i++) { if(current->data[i] == old->data[i]) total++; old->data[i]=current->data[i]; } return(current->size-total); } gsl_matrix* kmeans_getGroup(struct Kmeans *kmeans, int index){ int i; int j=0; int elm=kmeans_num_elements_group(kmeans,index); gsl_matrix *agroup=gsl_matrix_alloc(elm, kmeans->data->size2); gsl_vector *v1=gsl_vector_alloc(kmeans->data->size2); for(i=0;idata->size1;i++){ if(kmeans->index->data[i]==index){ gsl_matrix_get_row(v1, kmeans->data, i); gsl_matrix_set_row(agroup, j, v1); j++; } } return(agroup); } void kmeans_cluster(struct Kmeans *kmeans){ int i; double *ind; double sum; bool redo; gsl_vector_int *oldindex = gsl_vector_int_alloc(kmeans->index->size); cluster: redo=false; kmeans_randomize_centers(kmeans); for (kmeans->lastiter=0;kmeans->lastitermaxiter;kmeans->lastiter++){ kmeans_calculate_indexes(kmeans); kmeans_recalculate_centers(kmeans); if(kmeans_check_converge(kmeans->index, oldindex, kmeans)==0) break; } for (i=0;ingroups;i++){ if(kmeans_num_elements_group(kmeans,i)==0) { kmeans->trials++; redo=true; break; } } if(redo) goto cluster; } void quick_cluster_show_results(struct Kmeans* kmeans){ int i,j; printf("Number of cases: %d\n",kmeans->n); printf("Number of variables: %d\n",kmeans->m); printf("Number of groups: %d\n",kmeans->ngroups); printf("Number of trials: %d\n",(kmeans->trials+1)); printf("Number of iterations at last trial: %d\n",(kmeans->lastiter+1)); printf("Centers:\n"); for (i=0;icenters->size1;i++){ printf("Center of Group %d: ",(i+1)); for(j=0;jcenters->size2;j++){ printf("%0.3f ",kmeans->centers->data[i * kmeans->centers->size2 + j]); } printf("\n"); } printf("Data Index:\n"); for (int i=0;iindex->size;i++){ printf("%3d ",kmeans->index->data[i]+1); } printf("\n"); } int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) { struct Kmeans *kmeans; double* data; struct ccase *c; bool ok; struct dictionary *dict = dataset_dict(ds); int n; struct variable **variables; struct casereader *input, *inputnew; int groups=0; int numobs=0; int maxiter=0; struct quick_cluster *qc=(struct quick_cluster*)malloc(sizeof(struct quick_cluster)); int i,j; parse_variables_const (lexer, dict, &variables, &n ,PV_NO_DUPLICATE | PV_NUMERIC); if(lex_match (lexer, T_SLASH)){ if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "CLUSTERS")) { if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); groups = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } }else if(lex_match_id(lexer, "MXITER")){ if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); maxiter = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } }else{ printf("Error parsing command.\n"); return(CMD_FAILURE); } } } } inputnew=proc_open (ds); numobs=casereader_count_cases(inputnew); if(groups>numobs){ printf("Number of groups cannot be larger than the number of cases.\n"); ok = casereader_destroy (inputnew); proc_commit (ds); return(CMD_FAILURE); } data=(double*)malloc(numobs * n * sizeof(double)); i=0;j=0; for (; (c = casereader_read (inputnew)) != NULL; case_unref (c)) { int v; double x; for (v = 0; v < n; ++v) { x = case_data (c, variables[v])->f; data[i * n + v] = x; } i++; } ok = casereader_destroy (inputnew); ok = proc_commit (ds) && ok; kmeans=kmeans_create(data, numobs, n, groups, maxiter); kmeans_cluster(kmeans); quick_cluster_show_results(kmeans); return(ok); }