Monday, March 14, 2011

Find the distribution of data points using method of moments

For my third object oriented programming module, we were given multiple sets of data and told to find the probability distributions that best fits them.
Then I chose to use the method of moments, but given the knowledge I have now, maybe maximum likelihood would have fared better.

The candidate distributions are:
Uniform
Rayleigh
Cauchy
Exponential
Gaussian
Logistic
Weibull

The code here:
/***************Assignment 2 OOP1, author: Chee Tji Hun***********/
#include <iostream>
#include <fstream>
#include <cmath>
#define _USE_MATH_DEFINES
#include <math.h>
#include <vector>
#include <string>
#include <sstream>
using namespace std;

/***************Input/output methods***********/
//return parameters as string in terms of a,b,c
string showParams(vector<double> params){
char paramName = 'a';
string returnString;
//print in order of a,b,c stop when params size met
for(unsigned int i=0;i<params.size();i++){
string s_ParamName(1,char(paramName+i));
stringstream s;
s<<params[i];
returnString+=" "+s_ParamName+"="+s.str()+",";
}
return returnString;
}

//read the input files and returns the data points
void readInputFiles(vector<double> &datapoints, string filename){
datapoints.clear();
datapoints.resize(1000);
//set vector elements to 0
for(unsigned int i=0; i<datapoints.size(); i++){datapoints[i]=0;}
double temp=0;
int index = 0;
ifstream in(filename.c_str());
//read double into the vector, find its bin and increment bin count
while(in>>temp){
index = int(temp);
datapoints[index]++;
}
}

/***************Discriptors of the data points***********/
//return total count of datapoints
double getTotalObs(vector<double> &datapoints){
double totalObs=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
totalObs+=datapoints[i];
}
return totalObs;
}

//return the mean of the datapoints
double getMean(const vector<double> &datapoints){
double numOfOccurence=0;
double total=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
total +=(i*datapoints[i]);
numOfOccurence+=datapoints[i];
}
return total/numOfOccurence;
}

//return the variance of the datapoints
double getVariance(const vector<double> &datapoints, double mean){
double numOfOccurence=0;
double total=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
total +=(i*i*datapoints[i]);
numOfOccurence+=datapoints[i];
}
return total/numOfOccurence - pow(mean,2.0);
}

//return the mode of the datapoints
double getMode(const vector<double> &datapoints){
double biggestBin=-1;
double mode=-1.0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]>biggestBin){
biggestBin=datapoints[i];
mode=double(i);
}
}
return mode;
}

//return the median of the datapoints
double getMedian(const vector<double> &datapoints, double totalObs){
double halfWayPoint = totalObs/2.0;
double median = -1.0;

for(unsigned int i=0;i<datapoints.size();i++){
halfWayPoint-=datapoints[i];
if(halfWayPoint<=0){
median=double(i);
break;
}
}
return median;
}

/***************Various distributions and their respective a,b,c estimation methods***********/
/***************ALL the parameters are estimated using method of moments***********/
/*********************Uniform********************/
void getUniformParam(double variance, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(1);
param[0] = (1.0/sqrt(variance*12)) * totalObs; //param athis is the scaling factor, since not normalized

step.clear();
step.resize(1);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
}

double uniform(double x, const vector<double> &param){
double a = param[0];
return a;
}

/*********************Rayleigh********************/
void getRayleighParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double sigma1 = mean*sqrt((2.0/M_PI)); //traditional rayleigh formula contains sigma which can be arrived in two ways, take average
double sigma2= sqrt(variance*(2.0/(4-M_PI)));
double avgSigma = (sigma1+sigma2)/2.0;
param.clear();
param.resize(2);
param[0] = (1/(avgSigma*avgSigma))*totalObs; //param a, rayleigh formula with scaling factor
param[1] = 2*avgSigma*avgSigma;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}

double rayleigh(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*x*pow(M_E,-(pow(x,2.0)/b));
}

/*********************Exponential********************/
void getExponentialParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double lambda1 = 1.0/mean;//traditional Exponential formula contains lambda which can be arrived in two ways, take average
double lambda2 = sqrt(1.0/variance);
double avgLambda = (lambda1+lambda2)/2.0;
param.clear();
param.resize(2);
param[0] = avgLambda*totalObs; // param a, exponential formula with scaling factor
param[1] = 1.0/avgLambda;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}

double exponential(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*pow(M_E,-x/b);
}

/*********************Cauchy********************/
void getCauchyParam(double mode, double median, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(2);
param[0] = (1.0/M_PI)*totalObs; // param a
param[1] = (mode+median)/2;// param b

step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment
step[1]=param[1]/10.0;//param b delta increment
}

double cauchy(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a/(1+pow(x-b,2.0));
}

/*********************Gaussian********************/
void getGaussianParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double stdDev = sqrt(variance);//the famous gaussian,estimated with mean and variance
param.clear();
param.resize(3);
param[0] = (1.0/(sqrt(2.0*M_PI)*stdDev)) * totalObs; // param a
param[1] = mean;// param b
param[2] = 2*variance;// param c
step.clear();
step.resize(3);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
step[2]=param[2]/10.0;//param c delta increment
}

double gaussian(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
double c = param[2];
return a*pow(M_E,-(pow(x-b,2)/c));
}

/*********************Logistic********************/
void getLogisticParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double s = sqrt((3*variance)/pow(M_PI,2.0));//s is derived from the traditional logistic formula
param.clear();
param.resize(2);
param[0] = (1.0/s)*totalObs; // param a
param[1] = s;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}

double logistic(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*pow(M_E,-x/b)/(pow(1+pow(M_E,-x/b),2));
}

/*********************Weibull********************/
void getWeibullParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(3);
//found using data plot on excel, data looks like a weibull or rayleigh,
//but rayleigh can never meet goodness of fit criteria because of its constraints
param[0]=18; //param a
param[1]=1.5; //param b
param[2]=400; //param c

step.clear();
step.resize(3);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
step[2]=param[2]/10.0;//param c delta increment
}

double weibull(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
double c = param[2];
double result=a*pow(x, b-1.0)*pow(M_E,-pow(x/c,b));
return result;
}

/***************calculations and algorithm for finding the min Mean square error***********/
//calculate the mean square error for all datapoints
double meanSquareError(double (*dist)(double,const vector<double>&),const vector<double> &datapoints,vector<double> &params){
double sum = 0.0;
//for each datapoint, run the get mean square error and sum it up, potentially sum could overflow,
//I catch this error in the calling method,
//alternatively we could have implemented a 128 bit struct{double double bool} to contain sum and indicate if it overflows
for(unsigned int i=0; i<datapoints.size(); i++){
double temp = pow((dist(i,params) - datapoints[i]), 2.0);
sum += temp;
}
return sum;
}

//get the nearest neighbours of the parameters
void getPaths(vector<vector<double> > &paths, vector<double> params, vector<double> steps){
int numOfParam = params.size();//number of axis
int numOfNeighbours = int(pow(3.0, numOfParam));//num of neighbours inclusive of current location
vector<double> tempSteps = steps;
//for each neighbour
for(int i=0;i<numOfNeighbours;i++){
//compute the its location with regards to each axis
for(int j=0;j<numOfParam; j++){
int base = int(pow(3.0,j));
int baseHigh = int(pow(3.0,j+1));
tempSteps[j] = (((i%baseHigh)/base)-1)*steps[j];//scale it down to (-1,0,1) multiplied by its search granularity
paths[i][j] = params[j]+tempSteps[j];
}
}
}

//optimize the params to find the min mse, return the optimized parameters (params) and the corresponding mean square error (bestMse)
void findBest(double (*distribution)(double,const vector<double>&), const vector<double> &datapoints
, double &bestMse, vector<double> &params, vector<double> steps, int stepDownMax){
vector<vector<double> > paths;//neighbours of current param incl. of itself
paths.resize(int(pow(3.0,double(params.size()))));//num of paths (eg: param = a,b,c paths.size() = 27, param=a,b paths.size()=9)
for(unsigned int i=0;i<paths.size();i++){
paths[i] = vector<double>(params.size(),0.0);
}

int searchCount=0;
int maxSearch=1000; //do not search for best move more than this count
int stepDownCount = 0;
bool isStopSearch=0; //stop search flag
vector<double> bestMove = params;
bestMse=0; //the lowest mean square error achieved by the best move
int noMoveIndex=(paths.size()-1)/2; //current location index
while (!isStopSearch){
getPaths(paths,params,steps);//find all the neighbours in solution space
//for each path find the mse, store the best move
for(unsigned int i=0;i<paths.size();i++){
double mse=meanSquareError(distribution,datapoints,paths[i]);
if(i==0) bestMse=mse;
else if(mse<bestMse){
bestMove=paths[i]; // this is the best move
bestMse=mse; //this is the best mean square error
}
}
//if this current location is the best, then step down and search with increased granularity
if(bestMove==paths[noMoveIndex]){
for(unsigned int i=0;i<steps.size();i++){ steps[i]/=10;}
stepDownCount++;
}
params=bestMove; //repeat search with newest best move as the start point
searchCount++;

//stop search conditions
if(stepDownCount>=stepDownMax){isStopSearch = 1;}//max granularity reached
if(bestMse<=0.0){isStopSearch = 1;}//best mse overflowed, try some other path
if(searchCount>=maxSearch) {isStopSearch = 1;} //max amount of searches exceeded
}
}

int main(){
vector<string> filenames;
filenames.push_back("Data2.txt");
filenames.push_back("Data3.txt");
filenames.push_back("Data4.txt");
filenames.push_back("Data5.txt");
filenames.push_back("Data6.txt");
filenames.push_back("Data7.txt");
vector<double> datapoints, params,steps;
double mseTolerance = 1e5;
//for each data file
for (unsigned int i=0; i< filenames.size(); i++){
string datafile = filenames[i];

readInputFiles(datapoints, datafile);

//get the mean, variance, total observations, mode and median of the data points,
//could have consolidated all the discriptors into one method and thus one for loop
double totalObs = getTotalObs(datapoints); //find number of data points
double mean = getMean(datapoints);//find mean of data points
double variance = getVariance(datapoints, mean);//find variance of data points
double mode = getMode(datapoints);//find mode of data points
double median = getMedian(datapoints, totalObs);//find median of data points
int maxStepDown = 3;//increase granularity of search count
double bestMse;//lowest mean square error for a particular method
double (*distribution)(double, const vector<double>&); //function pointer to various distributions

/*test each distribution from least time consuming to most time consuming; ascending order of params*/
//try uniform distribuion
distribution = uniform;
getUniformParam(variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown); //find the best mse
if(bestMse<=mseTolerance){
cout<<datafile<<": Uniform distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try exponential distribution
distribution = exponential;
getExponentialParam(mean,variance,totalObs, params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Exponential distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try Rayleigh distribution
distribution = rayleigh;
getRayleighParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Rayleigh distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try Logistic distribution
distribution = logistic;
getLogisticParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Logistic distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try Cauchy distribution
distribution = cauchy;
getCauchyParam(mode,median,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Cauchy distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try Gaussian distribution
distribution = gaussian;
getGaussianParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Gaussian distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//try Weibull distribution
distribution = weibull;
getWeibullParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Weibull distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}

//did not find any distribution that fits
cout<<datafile<<": no distribution found :("<<endl;
}//next file
return 0;
}

No comments:

Post a Comment