/* PSPP - a program for statistical analysis.
   Copyright (C) 2009, 2010 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include <config.h>

#include <math.h>

#include <libpspp/misc.h>

#include <libpspp/str.h>
#include <libpspp/message.h>


#include <data/procedure.h>
#include <data/missing-values.h>
#include <data/casereader.h>
#include <data/casegrouper.h>
#include <data/dictionary.h>
#include <data/format.h>

#include <language/lexer/variable-parser.h>
#include <language/command.h>
#include <language/lexer/lexer.h>

#include <output/tab.h>
#include <output/text-item.h>

#include <stdio.h>
#include <stdlib.h>

#include <gsl/gsl_matrix.h>
#include <gsl/gsl_statistics.h>

#include <gsl/gsl_rng.h>
#include <math/random.h>

#include "gettext.h"
#define _(msgid) gettext (msgid)
#define N_(msgid) msgid

/*
Struct KMeans:
Holds all of the information for the functions.
*/
struct Kmeans
{
  gsl_matrix *data;		//User Data (Given by the user)
  gsl_matrix *centers;		//Centers for groups
  gsl_vector_int *index;	//Integer values from zero to ngroups. Shows group of an observation.
  gsl_vector *v1, *v2, *v3;	//Temporary vector for program. Do not use.
  gsl_rng *rng;			//Random Number Generator.
  int ngroups;			//Number of group. (Given by the user)
  int n;			//Number of observations. (Given by the user)
  int m;			//Number of variables. (Given by the user)
  int maxiter;			//Maximum number of iterations (Given by the user)
  int lastiter;			//Show at which iteration it found the solution.
  int trials;			//If not convergence, how many times has clustering done.
  double *weights;		//Double values for handling weights for program use.
  gsl_matrix *initial_centers;	//Initial random centers
  const struct variable **variables;	//Variables
};


/*
Creates and returns a struct of Kmeans with given casereader 'cs', parsed variables 'variables',
number of cases 'n', number of variables 'm', number of clusters and amount of maximum iterations.
*/
static struct Kmeans *
kmeans_create (struct casereader *cs, const struct variable **variables,
	       int n, int m, int ngroups, int maxiter)
{
  int i, v;
  double x;
  struct ccase *c;
  struct Kmeans *kmeans = xmalloc (sizeof (struct Kmeans));
  kmeans->data = gsl_matrix_alloc (n, m);
  kmeans->centers = gsl_matrix_alloc (ngroups, m);
  kmeans->ngroups = ngroups;
  kmeans->index = gsl_vector_int_alloc (n);
  kmeans->n = n;
  kmeans->m = m;
  kmeans->maxiter = maxiter;
  kmeans->lastiter = 0;
  kmeans->trials = 0;
  kmeans->variables = variables;
  i = 0;
  for (; (c = casereader_read (cs)) != NULL; case_unref (c))
    {
      for (v = 0; v < m; ++v)
	{
	  x = case_data (c, variables[v])->f;
	  /*
	     I think using k->data[i*p+v]=x is faster
	     but for being convenient, i am using gsl_matrix_set() here.
	   */
	  gsl_matrix_set (kmeans->data, i, v, x);
	}
      i++;
    }
  kmeans->weights = xmalloc (sizeof (double) * kmeans->index->size);
  kmeans->v1 = gsl_vector_alloc (kmeans->centers->size2);
  kmeans->v2 = gsl_vector_alloc (kmeans->centers->size2);
  kmeans->v3 = gsl_vector_alloc (kmeans->n);
  kmeans->rng = get_rng ();
  kmeans->initial_centers = NULL;
  return (kmeans);
}

/*
Creates random centers using randomly selected cases from the data.
*/
static void
kmeans_randomize_centers (struct Kmeans *kmeans)
{
  int i, j;
  int randind;
  for (i = 0; i < kmeans->centers->size1; i++)
    {
      randind = (int) (gsl_rng_uniform (kmeans->rng) * kmeans->data->size1);
      for (j = 0; j < kmeans->centers->size2; j++)
	{
	  gsl_matrix_set (kmeans->centers, i, j,
			  gsl_matrix_get (kmeans->data, randind, j));
	}
    }

/*
If it is the first iteration, the variable kmeans->initial_centers is NULL and
it is created once for reporting issues. In SPSS, initial centers are shown in the reports
but in PSPP it is not shown now. I am leaving it here.
*/
  if (!kmeans->initial_centers)
    {
      kmeans->initial_centers =
	gsl_matrix_alloc (kmeans->centers->size1, kmeans->centers->size2);
      for (i = 0; i < kmeans->centers->size1; i++)
	{
	  for (j = 0; j < kmeans->centers->size2; j++)
	    {
	      gsl_matrix_set (kmeans->initial_centers, i, j,
			      gsl_matrix_get (kmeans->centers, i, j));
	    }
	}
    }
}

/*
kmeans_randomize_index() was used at the older versions of quick_cluster
but we are not using it now. I am leaving it here.
*/
/*
static void
kmeans_randomize_index (struct Kmeans *kmeans)
{
  int i;
  for (i = 0; i < kmeans->index->size; i++)
    {
      kmeans->index->data[i] = -1;
    }
}
*/


/*
kmeans_print() may be used for debugging issues.
*/
/*
static void
kmeans_print (struct Kmeans *kmeans)
{
  int i, j;
  printf ("Number of groups: %d\n", kmeans->ngroups);
  printf ("Centers:\n");
  for (i = 0; i < kmeans->centers->size1; i++)
    {
      for (j = 0; j < kmeans->centers->size2; j++)
	{
	  printf ("%f ", gsl_matrix_get (kmeans->centers, i, j));
	}
      printf ("\n");
    }

  printf ("Index:\n");
  for (i = 0; i < kmeans->n; i++)
    {
      printf ("%d ", gsl_vector_int_get (kmeans->index, i));
    }
  printf ("\nLast iter: %d\n", kmeans->lastiter);
}
*/

/*
Prints a gsl_matrix.
I am leaving it here for debugging issues.
*/
/*
static void
print_matrix (gsl_matrix * m)
{
  int i, j;
  for (i = 0; i < m->size1; i++)
    {
      for (j = 0; j < m->size2; j++)
	{
	  printf ("%f ", m->data[i * m->size2 + j]);
	}
      printf ("\n");
    }
}
*/


/*
Calculates the squared euclidean distances between two vectors.
Square root is not used here for decreasing computation times.
Comparing squared distances equals to comparing non-squared ones.
*/
static double
kmeans_euclidean_distance (gsl_vector * v1, gsl_vector * v2)
{
  double result = 0.0;
  double val;
  int i;
  for (i = 0; i < v1->size; i++)
    {
      val = v1->data[i] - v2->data[i];
      result += val * val;
    }
  return (result);
}


/*
Returns the actual number of cases of a cluster
*/
static int
kmeans_num_elements_group (struct Kmeans *kmeans, int group)
{
  int total = 0;
  int i;
  for (i = 0; i < kmeans->n; i++)
    {
      if (gsl_vector_int_get (kmeans->index, i) == group)
	total++;
    }
  return (total);
}


/*
Re-calculates the cluster centers
*/
static void
kmeans_recalculate_centers (struct Kmeans *kmeans)
{
  int i, j, h;
  int elm;
  double mean;
  gsl_vector *v1 = kmeans->v3;

  for (i = 0; i < kmeans->ngroups; i++)
    {
      elm = kmeans_num_elements_group (kmeans, i);
      for (j = 0; j < kmeans->index->size; j++)
	{
	  if (gsl_vector_int_get (kmeans->index, j) == i)
	    {
	      kmeans->weights[j] = 1.0;
	    }
	  else
	    {
	      kmeans->weights[j] = 0.0;
	    }
	}

      for (h = 0; h < kmeans->m; h++)
	{
	  gsl_matrix_get_col (v1, kmeans->data, h);
	  mean = gsl_stats_wmean (kmeans->weights, 1, v1->data, 1, v1->size);
	  gsl_matrix_set (kmeans->centers, i, h, mean);
	}
    }
}

/*
The variable index in struct Kmeans holds integer values that represents the current groups of cases.
index[n]=a shows the nth case is belong to ath cluster.
*/
static void
kmeans_calculate_indexes (struct Kmeans *kmeans)
{
  double dist;
  double mindist;
  int bestindex = 0;
  int i, j;
  gsl_vector *v1 = kmeans->v1;
  gsl_vector *v2 = kmeans->v2;

  for (i = 0; i < kmeans->n; i++)
    {
      mindist = INFINITY;
      gsl_matrix_get_row (v1, kmeans->data, i);
      for (j = 0; j < kmeans->ngroups; j++)
	{
	  gsl_matrix_get_row (v2, kmeans->centers, j);
	  dist = kmeans_euclidean_distance (v1, v2);
	  if (dist < mindist)
	    {
	      mindist = dist;
	      bestindex = j;
	    }
	}
      gsl_vector_int_set (kmeans->index, i, bestindex);
    }
}

/*
Returns the number of different cases of index variables.
If last two index variables are equal, there is no any enhancement of clustering.
*/
static int
kmeans_check_converge (gsl_vector_int * current, gsl_vector_int * old)
{
  int i;
  int total = 0;
  for (i = 0; i < current->size; i++)
    {
      if (current->data[i] == old->data[i])
	total++;
      old->data[i] = current->data[i];
    }
  return (current->size - total);
}


/*
static gsl_matrix *
kmeans_get_group (struct Kmeans *kmeans, int index)
{
  int i;
  int j = 0;
  int elm = kmeans_num_elements_group (kmeans, index);
  gsl_matrix *agroup = gsl_matrix_alloc (elm, kmeans->data->size2);
  gsl_vector *v1 = gsl_vector_alloc (kmeans->data->size2);
  for (i = 0; i < kmeans->data->size1; i++)
    {
      if (kmeans->index->data[i] == index)
	{
	  gsl_matrix_get_row (v1, kmeans->data, i);
	  gsl_matrix_set_row (agroup, j, v1);
	  j++;
	}
    }
  return (agroup);
}
*/


/*
Main algorithm.
Does iterations, checks convergency
*/
static void
kmeans_cluster (struct Kmeans *kmeans)
{
  int i;
  bool redo;
  gsl_vector_int *oldindex = gsl_vector_int_alloc (kmeans->index->size);

cluster:
  redo = false;
  kmeans_randomize_centers (kmeans);
  for (kmeans->lastiter = 0; kmeans->lastiter < kmeans->maxiter;
       kmeans->lastiter++)
    {
      kmeans_calculate_indexes (kmeans);
      kmeans_recalculate_centers (kmeans);
      if (kmeans_check_converge (kmeans->index, oldindex) == 0)
	break;
    }

  for (i = 0; i < kmeans->ngroups; i++)
    {
      if (kmeans_num_elements_group (kmeans, i) == 0)
	{
	  kmeans->trials++;
	  redo = true;
	  break;
	}
    }
  if (redo)
    goto cluster;

}


/*
Reports centers of clusters.
initial parameter is optional for future use.
if initial is true, initial cluster centers are reported. Otherwise, resulted centers are reported.
*/
static void
quick_cluster_show_centers (struct Kmeans *kmeans, bool initial)
{
  struct tab_table *t;
  int nc, nr, heading_columns, currow;
  int i, j;
  nc = kmeans->ngroups + 1;
  nr = kmeans->data->size2 + 4;
  heading_columns = 1;
  t = tab_create (nc, nr);
  tab_headers (t, 0, nc - 1, 0, 1);
  currow = 0;
  if (!initial)
    {
      tab_title (t, _("Final Cluster Centers"));
    }
  else
    {
      tab_title (t, _("Initial Cluster Centers"));
    }
  tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
  tab_joint_text (t, 1, 0, nc - 1, 0, TAB_CENTER, _("Cluster"));
  tab_hline (t, TAL_1, 1, nc - 1, 2);
  currow += 2;

  for (i = 0; i < kmeans->ngroups; i++)
    {
      tab_text_format (t, (i + 1), currow, TAB_CENTER, "%d", (i + 1));
    }
  currow++;
  tab_hline (t, TAL_1, 1, nc - 1, currow);
  currow++;
  for (i = 0; i < kmeans->data->size2; i++)
    {
      tab_text (t, 0, currow + i, TAB_LEFT,
		var_to_string (kmeans->variables[i]));
    }

  for (i = 0; i < kmeans->centers->size1; i++)
    {
      for (j = 0; j < kmeans->centers->size2; j++)
	{
	  if (!initial)
	    {
	      tab_double (t, i + 1, j + 4, TAB_CENTER,
			  gsl_matrix_get (kmeans->centers, i, j),
			  var_get_print_format (kmeans->variables[j]));
	    }
	  else
	    {
	      tab_double (t, i + 1, j + 4, TAB_CENTER,
			  gsl_matrix_get (kmeans->initial_centers, i, j),
			  var_get_print_format (kmeans->variables[j]));
	    }
	}
    }
  tab_submit (t);
}


/*
Reports number of cases of each single cluster.
*/
static void
quick_cluster_show_number_cases (struct Kmeans *kmeans)
{
  struct tab_table *t;
  int nc, nr;
  int i, numelem;
  long int total;
  nc = 3;
  nr = kmeans->ngroups + 1;
  t = tab_create (nc, nr);
  tab_headers (t, 0, nc - 1, 0, 0);
  tab_title (t, _("Number of Cases in each Cluster"));
  tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
  tab_text (t, 0, 0, TAB_LEFT, _("Cluster"));

  total = 0;
  for (i = 0; i < kmeans->ngroups; i++)
    {
      tab_text_format (t, 1, i, TAB_CENTER, "%d", (i + 1));
      numelem = kmeans_num_elements_group (kmeans, i);
      tab_text_format (t, 2, i, TAB_CENTER, "%d", numelem);
      total += numelem;
    }

  tab_text (t, 0, kmeans->ngroups, TAB_LEFT, _("Valid"));
  tab_text_format (t, 2, kmeans->ngroups, TAB_LEFT, "%ld", total);
  tab_submit (t);
}

/*
Reports
*/
static void
quick_cluster_show_results (struct Kmeans *kmeans)
{
  //uncomment the line above for reporting initial centers
  //quick_cluster_show_centers (kmeans, true);
  quick_cluster_show_centers (kmeans, false);
  quick_cluster_show_number_cases (kmeans);
}


int
cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
{
  struct Kmeans *kmeans;
  bool ok;
  struct dictionary *dict = dataset_dict (ds);
  const struct variable **variables;
  struct casereader *cs;
  int groups = 2;
  int numobs = 0;
  int maxiter = 2;
  size_t p;

  if (!parse_variables_const (lexer, dict, &variables, &p,
			      PV_NO_DUPLICATE | PV_NUMERIC))
    {
      msg (ME, _("Variables cannot be parsed"));
      return (CMD_FAILURE);
    }


  if (lex_match (lexer, T_SLASH))
    {
      if (lex_match_id (lexer, "CRITERIA"))
	{
	  lex_match (lexer, T_EQUALS);
	  while (lex_token (lexer) != T_ENDCMD
		 && lex_token (lexer) != T_SLASH)
	    {
	      if (lex_match_id (lexer, "CLUSTERS"))
		{
		  if (lex_force_match (lexer, T_LPAREN))
		    {
		      lex_force_int (lexer);
		      groups = lex_integer (lexer);
		      lex_get (lexer);
		      lex_force_match (lexer, T_RPAREN);
		    }
		}
	      else if (lex_match_id (lexer, "MXITER"))
		{
		  if (lex_force_match (lexer, T_LPAREN))
		    {
		      lex_force_int (lexer);
		      maxiter = lex_integer (lexer);
		      lex_get (lexer);
		      lex_force_match (lexer, T_RPAREN);
		    }
		}
	      else
		{
		  //further command set
		  return (CMD_FAILURE);
		}
	    }
	}
    }


  cs = proc_open (ds);
  numobs = casereader_count_cases (cs);

  if (groups > numobs)
    {
      printf
	(_("Number of groups cannot be larger than the number of cases.\n"));
      ok = casereader_destroy (cs);
      proc_commit (ds);
      return (CMD_FAILURE);
    }

  kmeans = kmeans_create (cs, variables, numobs, p, groups, maxiter);
  ok = casereader_destroy (cs);
  ok = proc_commit (ds) && ok;

  kmeans_cluster (kmeans);
  quick_cluster_show_results (kmeans);

  return (ok);
}