(* :Title: Data Manipulation *)

(* :Author: Wolfram Research, Inc. *)

(* :Summary:
This package provides an extension of the list manipulation
functions that are built-in to Mathematica.  Additional functions
useful for manipulating statistical data include frequency counting
and computing cumulative sums.
*)

(* :Context: Statistics`DataManipulation` *)

(* :Package Version: 1.4 *)

(* :Copyright: Copyright 1990-2005, Wolfram Research, Inc. *)

(* :History:
	1.1: original version, 1990.
	1.2: improved speed of Frequencies using V3.0 function Split
		(contribution of Pavel Zaruba), improved speed of
		BinCounts and RangeCounts, ECM, 1997.
	1.3: improved efficiency for a number of functions, 
		Darren Glosemeyer, 2004.
	1.4 obsoleted BooleanSelect which is superceded by Pick,
	    Darren Glosemeyer, 2005.
*)

(* :Keywords: *)

(* :Source: None. *)

(* :Warning: Expands the definition of Column. *)

(* :Mathematica Version: 5.2 *)

(* :Limitation: None known. *)

(* :Discussion: *)


BeginPackage["Statistics`DataManipulation`"]

Column::usage = "Column[data, n] gives the n-th column in data.
Column[data, {n1, n2, ...}] gives a list of columns n1, n2, ..."

ColumnTake::usage =
"ColumnTake[data, spec] takes the specified columns in data."

ColumnDrop::usage =
"ColumnDrop[data, spec] drops the specified columns in data."

ColumnJoin::usage =
"ColumnJoin[data1, data2, ...] joins elements in corresponding
columns in the datai."

RowJoin::usage =
"RowJoin[data1, data2, ...] joins elements in corresponding rows
in the datai."

DropNonNumeric::usage =
"DropNonNumeric[data] drops elements or rows that contain non-numeric
elements in data."

DropNonNumericColumn::usage =
"DropNonNumericColumn[data] drops columns that contain non-numeric
elements in data."

BooleanSelect::usage =
"BooleanSelect[list, sel] keeps elements of list for which the
corresponding element of sel is True."

TakeWhile::usage =
"TakeWhile[list, pred] takes elements from the beginning of list
while pred is True."

LengthWhile::usage =
"LengthWhile[list, pred] gives the contiguous elements starting
from the beginning of list, for which pred is True."

Frequencies::usage =
"Frequencies[list] gives a list of the distinct elements in list,
together with the frequencies with which they occur."

QuantileForm::usage =
"QuantileForm[list] sorts the elements in list, then gives
a list of the elements, paired with their quantile position."

CumulativeSums::usage =
"CumulativeSums[list] gives cumulative sums of list."

BinCounts::usage =
"BinCounts[{x1, x2, ...}, {xmin, xmax, dx}] gives a list of the number of
elements in the data {x1, x2, ...} that lie in bins from xmin to xmax in
steps  of dx.  The bin boundaries are {xmin < x <= xmin + dx, ...,
xmax - dx < x <= xmax}.  BinCounts[{{x1, y1}, {x2, y2}, ...}, {xmin, xmax, dx},
{ymin, ymax, dy}] gives a 2-dimensional array of bin counts for the
bivariate data {{x1, y1}, {x2, y2}, ...}.  In general, BinCounts gives a
p-dimensional array of bin counts for p-variate data."

BinLists::usage =
"BinLists[{x1, x2, ...}, {xmin, xmax, dx}] gives lists of the elements in
the data {x1, x2, ...} that lie in bins from xmin to xmax in steps of dx.
BinLists[{{x1, y1}, {x2, y2}, ...}, {xmin, xmax, dx}, {ymin, ymax, dy}] gives
a 2-dimensional array of bin lists for the bivariate data {{x1, y1}, {x2, y2},
 ...}.  In general, BinLists gives a p-dimensional array of bin lists for
p-variate data."

RangeCounts::usage =
"RangeCounts[{x1, x2, ...}, {c1, c2, ..., cm}] gives a list of the number of
elements in the data {x1, x2, ...} that lie between successive cutoffs.
The range boundaries are {x < c1, c1 <= x < c2, ..., x >= cm}.
RangeCounts[{{x1, y1}, {x2, y2}, ...}, {xc1, xc2, ..., xcm},
{yc1, yc2, ..., ycn}] gives a 2-dimensional array of range counts for the
bivariate data {{x1, y1}, {x2, y2}, ...}.  In general, RangeCounts gives a
p-dimensional array of range counts for p-variate data."

RangeLists::usage =
"RangeLists[{x1, x2, ...}, {c1, c2, ..., cm}] gives lists of the elements in the
data {x1, x2, ...} that lie between successive cutoffs.  RangeLists[{{x1, y1},
{x2, y2}, ...}, {xc1, xc2, ...}, {yc1, yc2, ...}] gives a 2-dimensional array
of range lists for the bivariate data {{x1, y1}, {x2, y2}, ...}.  In general,
RangeLists gives a p-dimensional array of range lists for p-variate data."

CategoryCounts::usage =
"CategoryCounts[{x1, x2, ...}, {e1, e2, ...}] gives a list containing the
number of elements in the data {x1, x2, ...} that match each of the ei.
CategoryCounts[{x1, x2, ...}, {{e11, e12, ...}, {e21, e22, ...}, ...}] gives a
list of the number of elements in the data {x1, x2, ...} that match any of the
elements in each list {ei1, ei2, ...}.  CategoryCounts[{{x1, y1}, {x2, y2},
 ...}, {xe1, xe2, ...}, {ye1, ye2, ...}] and CategoryCounts[{{x1, y1},
{x2, y2}, ...}, {{xe11, xe12, ...}, {xe21, xe22, ...}, ...},
{{ye11, ye12, ...}, {ye21, ye22, ...}, ...}] both give 2-dimensional arrays of
category counts for the bivariate data {{x1, y1}, {x2, y2}, ...}.  In general,
CategoryCounts gives a p-dimensional array of category counts for p-variate
data."

CategoryLists::usage =
"CategoryLists[{x1, x2, ...}, {e1, e2, ...}] gives lists of the elements in the
data {x1, x2, ...} that match each of the ei.  CategoryLists[{x1, x2, ...},
{{e11, e12, ...}, {e21, e22, ...}, ...}] gives lists of the elements in the
data {x1, x2, ...} that match any of the elements in each list {ei1, ei2, ...}.
CategoryLists[{{x1, y1}, {x2, y2}, ...}, {xe1, xe2, ...}, {ye1, ye2, ...}]
and CategoryLists[{{x1, y1}, {x2, y2}, ...}, {{xe11, xe12, ...},
{xe21, xe22, ...}, ...}, {{ye11, ye12, ...}, {ye21, ye22, ...}, ...}] both
give 2-dimensional arrays of category lists for the bivariate data {{x1, y1},
{x2, y2}, ...}.  In general, CategoryLists gives a p-dimensional array of
category lists for p-variate data."

Unprotect[Column,ColumnTake,ColumnDrop,ColumnJoin,RowJoin,DropNonNumeric,
DropNonNumericColumn,BooleanSelect,TakeWhile,LengthWhile,
Frequencies,QuantileForm,CumulativeSums,BinCounts,RangeCounts,
CategoryCounts,RangeLists,BinLists,CategoryLists];

Begin["`Private`"]

Unprotect[Column]

Column[data:{___List}, n_Integer] := data[[All,n]]

Column[data:{___List}, ni:{__Integer}] := data[[All,ni]]

Protect[Column]

ColumnTake[data:{___List}, spec_] := Take[data, All, spec]

ColumnDrop[data:{___List}, spec_] := Drop[data, None, spec]

ColumnJoin[data:{___List}..] := Join[data]

RowJoin[data:{___List}..] := Apply[Join, Transpose[{data}], {1}] /;
				Equal @@ Map[Length, {data}]

DropNonNumeric[data_List] := Select[data, NumberQ] /; VectorQ[data]

DropNonNumeric[data:{___List}] := Select[data, Apply[And, Map[NumberQ, #]]&]

DropNonNumericColumn[data:{___List}] := 
	Module[{d = DropNonNumeric[Transpose[data]]},
		If[d==={},
		   {},
		   Transpose[d]
		]
        ]

booleanSelectMessage=True;

BooleanSelect::obslt="BooleanSelect is obsolete.  Use Pick instead.";

(* BooleanSelect is superceded by Pick in 5.1;  issue a message on first usage to warn users *)
BooleanSelect[list_List, sel_List] := (
	If[TrueQ[booleanSelectMessage],
		Message[BooleanSelect::obslt];
		booleanSelectMessage=False];
	Pick[list,sel]) /; Length[list] == Length[sel]

TakeWhile[list_List, pred_] :=
        Take[list, NestWhile[#1 + 1 &, 1, pred[list[[#]]] &, 1, Length[list]] - 1]

LengthWhile[list_List, pred_] :=
        NestWhile[#1 + 1 &, 1, pred[list[[#]]] &, 1, Length[list]] - 1

QuantileForm[list_List] :=
        Transpose[{Range[Length[list]]/Length[list], Sort[list]}] /;
                        VectorQ[list] (* VectorQ check because the
				sample Quantile is not easily defined for
				multidimensional data. *)

CumulativeSums[list_List] := FoldList[Plus,First[list],Drop[list,1]]


(* ================================ Frequencies ============================ *)

Frequencies[list_List] :=
	{Length[#],First[#]}&/@Split[Sort[list]]

(* Pavel Zaruba's suggestion for versions 2.2 and earlier versions that do not
	 support Split:
  Frequencies[list_List]:=
       Module[{freq,p,f},
               freq={}; p=list; f=0;
               Scan[If[p=!=#,freq={freq,f};p=#;f=1,++f]&,Sort[list]];
               Transpose[{Rest[Flatten[{freq, f}]],Union[list]}]]
*)

(* ============================ BinCounts ================================= *)

BinCounts[list_?(VectorQ[#, NumberQ]&), {xmin_?NumberQ, xmax_?NumberQ, dx_:1}] :=
        Module[  { nbin, vals, split, split1, counts, i } ,
                nbin = Ceiling[(xmax - xmin)/dx] ;
                vals = Ceiling[(list - xmin)/dx] ;
		split = Split[Sort[vals]];
		split1 = {};
                Scan[(If[1 <= First[#] <= nbin, AppendTo[split1, #]])&, split];
		counts = Array[0&, nbin];  i = 1;
		Scan[(While[First[#] != i, i++];
		      counts[[i]] = Length[#];
		      i++)&, split1];
		counts
        ] /; xmax > xmin && dx > 0


BinCounts[list_?(MatrixQ[#, NumberQ]&),
                bins:{_?NumberQ, _?NumberQ, Optional[_, 1]}..] :=
        Module[ { tbins, nbins, vals, n = Length[{bins}],
		  split, counts, i },
                tbins = Map[If[Length[#]==3, #, Append[#, 1]]&, {bins}] ;
                nbins = Map[Ceiling[(#[[2]] - #[[1]])/#[[3]]]&, tbins] ;
                vals = Transpose[
                        Ceiling[(Transpose[list] - First /@ tbins)
                                                        / (Last /@ tbins) ]] ;
		split = Split[Sort[vals]];
		 i = Array[1&, n];
		split = Select[split, And @@ Thread[i <= First[#] <= nbins] &];
		counts = Array[0&, nbins];
		Scan[(
		      While[(
			     First[#] =!= i),
			    i = increment[i, nbins, 1]]; (* next bin *)
		      counts = ReplacePart[counts, Length[#], i];
		      i = increment[i, nbins, 1])&, split];
		counts	
        ] /; Dimensions[list][[2]] == Length[{bins}] &&
	Apply[And, Map[(#[[2]] > #[[1]] && If[Length[#] == 3,
                        #[[3]] > 0, True])&, {bins}]]


(* =============================== BinLists =============================== *)

BinLists[list_?(VectorQ[#, NumberQ]&), {xmin_?NumberQ, xmax_?NumberQ, dx_:1}] :=
        Module[  { nbin, vals, i } ,
                nbin = Ceiling[(xmax - xmin)/dx] ;
                vals = Ceiling[(list - xmin)/dx] ;
                Table[ list[[ Flatten[ Position[vals, i] ] ]], {i, nbin}]
        ] /; xmax > xmin && dx > 0

BinLists[list_?(MatrixQ[#, NumberQ]&),
                bins:{_?NumberQ, _?NumberQ, Optional[_, 1]}..] :=
        Module[ { tbins, nbins, vals },
                tbins = Map[If[Length[#]==3, #, Append[#, 1]]&, {bins}] ;
                nbins = Map[Ceiling[(#[[2]] - #[[1]])/#[[3]]]&, tbins] ;
                vals = Transpose[
                        Ceiling[(Transpose[list] - First /@ tbins)
                                                        / (Last /@ tbins) ]] ;
                Array[ list[[ Flatten[Position[vals, {##}]] ]]&, nbins]
        ] /; Dimensions[list][[2]] == Length[{bins}] &&
	Apply[And, Map[(#[[2]] > #[[1]] && If[Length[#] == 3,
                        #[[3]] > 0, True])&, {bins}]]


(* ============================ RangeCounts ================================ *)


RangeCounts[list_?(VectorQ[#, NumberQ]&), clist_?(VectorQ[#, NumberQ]&)] :=
        Module[  { tree, vals, nbin = Length[clist] + 1, split, counts, i},
		tree = MakeTree[clist] ;
		vals = Map[TreeFind[tree, #]&, list] ;
		split = Split[Sort[vals]];
		counts = Array[0&, nbin];  i = 0;
                Scan[(While[First[#] != i, i++];
                      counts[[i+1]] = Length[#];
                      i++)&, split];
                counts
        ] /; FreeQ[clist, Complex]

RangeCounts[list_?(MatrixQ[#, NumberQ]&), clists:(_?(VectorQ[#, NumberQ]&)...)] :=
	Module[ { trees, vals, f,
		  n = Length[{clists}], nbins = Map[Length, {clists}] + 1,
		  split, counts, i},
		trees = Map[MakeTree, {clists}] ;
		f = Map[Function[{x}, TreeFind[#, x]]&, trees];
		vals = Map[MapIndexed[Function[{y, z}, f[[z[[1]]]][y] ],
			 #]&, list];
		split = Split[Sort[vals]];
		counts = Array[0&, nbins];  i = Array[0&, n];
		Scan[(
		      While[(
			     First[#] =!= i),
			     i = increment[i, nbins, 0]]; (* next bin *)
		      counts = ReplacePart[counts, Length[#], i+1];
		      i = increment[i, nbins, 0])&, split]; 	
	        counts
	] /; Dimensions[list][[2]] == Length[{clists}]


(* ============================= RangeLists ============================== *)

RangeLists[list_?(VectorQ[#, NumberQ]&), clist_?(VectorQ[#, NumberQ]&)] := 
        Module[  { tree, vals, i },
                tree = MakeTree[clist] ; 
                vals = Map[TreeFind[tree, #]&, list] ;
                Table[ list[[ Flatten[Position[vals, i] ] ]], 
						{i, 0, Length[clist]}]
        ] 
 
RangeLists[list_?(MatrixQ[#, NumberQ]&), clists:(_?(VectorQ[#, NumberQ]&)...)] := 
        Module[ { trees, vals, f } , 
                trees = Map[MakeTree, {clists}] ;
                vals = Map[Thread[f[trees, #]]&, list] ;
                vals = vals /. f->TreeFind ;
                Array[ list[[ Flatten[Position[vals, {##}]] ]]&, 
					Map[Length, {clists}] + 1, 0]
        ] /; Dimensions[list][[2]] == Length[{clists}]


(* ============================== CategoryCounts =========================== *)

CategoryCounts[list_List, clist_?VectorQ] :=
		Map[Count[list, #]&, clist]

CategoryCounts[list_List, clist:{_List...}] :=
		Apply[Plus, Map[Count[list, #]&, clist, {2}], {1}]

CategoryCounts[list:{_List...}, clists__List] :=
	Map[Count[list, #]&, Outer[List, clists], {Length[{clists}]}] /;
		Dimensions[list][[2]] == Length[{clists}] &&
			And @@ (VectorQ /@ {clists})

CategoryCounts[list:{_List...}, clists:{_List...}..] :=
	Module[{ f, len, cl }, 
		cl = {clists} ;
		len = Length[cl] ;
		cl = Apply[f, cl, {2}] ;
		cl = Apply[Outer[List, ##]&, cl] /. f->List ;
	Apply[Plus, Map[CC0[list, #]&, cl, {len}], {len}] 
	] /; Dimensions[list][[2]] == Length[{clists}] 

CC2[elem_, cat_] := Apply[Or, Map[ MatchQ[elem, #]&, cat ]]

CC1[elems_, cats_] := 
	Module[ { f },
		Apply[And, Thread[ f[elems, cats] ] /. f->CC2 ]
	]

CC0[list_, c_] := 
	Count[list, _?(CC1[#, c]&)]

(* ============================== CategoryLists ============================ *)

CategoryLists[list_List, clist_List] :=
                Map[Cases[list, #]&, clist] /; VectorQ[clist]

CategoryLists[list_List, clist:{_List...}] :=
		Apply[Join, Map[Cases[list, #]&, clist, {2}], {1}]
 
CategoryLists[list_List, clists__List] :=
        Map[Cases[list, #]&, Outer[List, clists], {Length[{clists}]}] /;
                Dimensions[list][[2]] == Length[{clists}] &&
			And @@ (VectorQ /@ {clists})

CategoryLists[list_List, clists:{_List...}..] :=
        Module[{ f, len, cl }, 
                cl = {clists} ; 
                len = Length[cl] ; 
                cl = Apply[f, cl, {2}] ; 
                cl = Apply[Outer[List, ##]&, cl] /. f->List ;
	        Map[CC0L[list, #]&, cl, {len}] 
        ] /; Dimensions[list][[2]] == Length[{clists}]

CC0L[list_, c_] :=
	Select[list, CC1[#, c]&]

(* ======================================================================== *)
(* increment allows you to locate the next bin in a multidimensional
	array of bins, where "next" is the order in which Sort would
	sort the bins. *)
(* i is a list, denoting the current bin *)
(* max is a list, denoting the dimensions of the bin array *)
(* origin is an integer (typically 0 or 1),
	 indicating the origin of the bin indexing scheme  *)
(*
 increment[{1, 1, 1}, {2, 2, 2}, 1]                     {1, 1, 2}
 increment[{1, 1, 2}, {2, 2, 2}, 1]                     {1, 2, 1}
	...
 increment[{2, 2, 2}, {2, 2, 2}, 1]                     {1, 1, 1}
*)

increment[i_, max_, origin_] :=
 Block[{j = i},
	Scan[(If[j[[#]] < max[[#]],
		 j[[#]] = j[[#]]+1;  Return[],
		 j[[#]] = origin])&,
	     	 Reverse[Range[Length[i]]] ];
	j
 ]

(*==== Tree manipulation from Tree.m inlined here ====*)

MakeTree::usage =
"MakeTree[list] creates a binary tree with each node labeled
by an element in list."

TreeFind::usage =
"TreeFind[treelist, x] finds the largest element smaller than or equal to x in
the list from which treelist was constructed."

MakeTree[{}] := {}

MakeTree[list_List] :=
   Block[{n, t},
		n = Length[list];
		t = Transpose[{Sort[list], Range[n]}] ;
		MakeTree0[ 1, n ]
   ]

MakeTree0[i_,j_] := Block[{midpoint,diff},
	diff = j-i;
	Which[
	   diff==3, {t[[i+1]],{t[[i]],{},{}},{t[[i+2]],{},{t[[i+3]],{},{}}}},
	   diff==2, {t[[i+1]],{t[[i]],{},{}},{t[[j]],{},{}}},
	   diff==1, {t[[i]],{},{t[[j]],{},{}}},
	   diff==0, {t[[i]],{},{}},
	   True, (
			midpoint = i + Quotient[diff,2];
			{t[[midpoint]], 
				MakeTree0[i,midpoint-1],
				MakeTree0[midpoint+1,j]}
		 )
	   ]]

TreeFind[{}, e_] := 0

TreeFind[tree_List, e_] :=
	 Block[{found=0, bar=e},
        	TreeFind0[tree];
	        found
	 ]

TreeFind0[tree_] :=
        Block[{m, k},
        {m, k} = First[tree] ;
        Which[
              bar < m, TreeFind0[tree[[2]]],
              bar > m, found = k ;TreeFind0[tree[[3]]],
              True, found = k; Return[]
        ]]

TreeFind0[{}] = 1

(*============== end Tree ========== *)


End[ ]

SetAttributes[ Column ,ReadProtected];
SetAttributes[ ColumnTake ,ReadProtected];
SetAttributes[ ColumnDrop ,ReadProtected];
SetAttributes[ ColumnJoin ,ReadProtected];
SetAttributes[ RowJoin ,ReadProtected];
SetAttributes[ DropNonNumeric ,ReadProtected];
SetAttributes[ DropNonNumericColumn ,ReadProtected];
SetAttributes[ BooleanSelect, ReadProtected];
SetAttributes[ TakeWhile, ReadProtected];
SetAttributes[LengthWhile , ReadProtected];
SetAttributes[ Frequencies, ReadProtected];
SetAttributes[ QuantileForm, ReadProtected];
SetAttributes[ CumulativeSums, ReadProtected];
SetAttributes[BinCounts , ReadProtected];
SetAttributes[RangeCounts , ReadProtected];
SetAttributes[CategoryCounts , ReadProtected];
SetAttributes[RangeLists , ReadProtected];
SetAttributes[ BinLists, ReadProtected];
SetAttributes[CategoryLists , ReadProtected];

Protect[Column,ColumnTake,ColumnDrop,ColumnJoin,RowJoin,DropNonNumeric,
DropNonNumericColumn,BooleanSelect,TakeWhile,LengthWhile,
Frequencies,QuantileForm,CumulativeSums,BinCounts,RangeCounts,
CategoryCounts,RangeLists,BinLists,CategoryLists];

EndPackage[ ]


