Chee Tji Hun: Find the distribution of data points using method of moments

For my third object oriented programming module, we were given multiple sets of data and told to find the probability distributions that best fits them.

Then I chose to use the method of moments, but given the knowledge I have now, maybe maximum likelihood would have fared better.

The candidate distributions are:

Uniform

Rayleigh

Cauchy

Exponential

Gaussian

Logistic

Weibull

The code here:

/***************Assignment 2 OOP1, author: Chee Tji Hun***********/

#include <iostream>

#include <fstream>

#include <cmath>

#define _USE_MATH_DEFINES

#include <math.h>

#include <vector>

#include <string>

#include <sstream>

using namespace std;

/***************Input/output methods***********/

//return parameters as string in terms of a,b,c

string showParams(vector<double> params){

char paramName = 'a';

string returnString;

//print in order of a,b,c stop when params size met

for(unsigned int i=0;i<params.size();i++){

string s_ParamName(1,char(paramName+i));

stringstream s;

s<<params[i];

returnString+=" "+s_ParamName+"="+s.str()+",";

}

return returnString;

}

//read the input files and returns the data points

void readInputFiles(vector<double> &datapoints, string filename){

datapoints.clear();

datapoints.resize(1000);

//set vector elements to 0

for(unsigned int i=0; i<datapoints.size(); i++){datapoints[i]=0;}

double temp=0;

int index = 0;

ifstream in(filename.c_str());

//read double into the vector, find its bin and increment bin count

while(in>>temp){

index = int(temp);

datapoints[index]++;

}

/***************Discriptors of the data points***********/

//return total count of datapoints

double getTotalObs(vector<double> &datapoints){

double totalObs=0;

for(unsigned int i=0;i<datapoints.size();i++){

if(datapoints[i]==0){continue;}

totalObs+=datapoints[i];

}

return totalObs;

}

//return the mean of the datapoints

double getMean(const vector<double> &datapoints){

double numOfOccurence=0;

double total=0;

for(unsigned int i=0;i<datapoints.size();i++){

if(datapoints[i]==0){continue;}

total +=(i*datapoints[i]);

numOfOccurence+=datapoints[i];

}

return total/numOfOccurence;

}

//return the variance of the datapoints

double getVariance(const vector<double> &datapoints, double mean){

double numOfOccurence=0;

double total=0;

for(unsigned int i=0;i<datapoints.size();i++){

if(datapoints[i]==0){continue;}

total +=(i*i*datapoints[i]);

numOfOccurence+=datapoints[i];

}

return total/numOfOccurence - pow(mean,2.0);

}

//return the mode of the datapoints

double getMode(const vector<double> &datapoints){

double biggestBin=-1;

double mode=-1.0;

for(unsigned int i=0;i<datapoints.size();i++){

if(datapoints[i]>biggestBin){

biggestBin=datapoints[i];

mode=double(i);

}

return mode;

}

//return the median of the datapoints

double getMedian(const vector<double> &datapoints, double totalObs){

double halfWayPoint = totalObs/2.0;

double median = -1.0;

for(unsigned int i=0;i<datapoints.size();i++){

halfWayPoint-=datapoints[i];

if(halfWayPoint<=0){

median=double(i);

break;

}

return median;

}

/***************Various distributions and their respective a,b,c estimation methods***********/

/***************ALL the parameters are estimated using method of moments***********/

/*********************Uniform********************/

void getUniformParam(double variance, double totalObs, vector<double> &param, vector<double> &step){

param.clear();

param.resize(1);

param[0] = (1.0/sqrt(variance*12)) * totalObs; //param athis is the scaling factor, since not normalized

step.clear();

step.resize(1);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

}

double uniform(double x, const vector<double> &param){

double a = param[0];

return a;

}

/*********************Rayleigh********************/

void getRayleighParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){

double sigma1 = mean*sqrt((2.0/M_PI)); //traditional rayleigh formula contains sigma which can be arrived in two ways, take average

double sigma2= sqrt(variance*(2.0/(4-M_PI)));

double avgSigma = (sigma1+sigma2)/2.0;

param.clear();

param.resize(2);

param[0] = (1/(avgSigma*avgSigma))*totalObs; //param a, rayleigh formula with scaling factor

param[1] = 2*avgSigma*avgSigma;// param b

step.clear();

step.resize(2);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

step[1]=param[1]/10.0;//param b delta increment

}

double rayleigh(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

return a*x*pow(M_E,-(pow(x,2.0)/b));

}

/*********************Exponential********************/

void getExponentialParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){

double lambda1 = 1.0/mean;//traditional Exponential formula contains lambda which can be arrived in two ways, take average

double lambda2 = sqrt(1.0/variance);

double avgLambda = (lambda1+lambda2)/2.0;

param.clear();

param.resize(2);

param[0] = avgLambda*totalObs; // param a, exponential formula with scaling factor

param[1] = 1.0/avgLambda;// param b

step.clear();

step.resize(2);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

step[1]=param[1]/10.0;//param b delta increment

}

double exponential(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

return a*pow(M_E,-x/b);

}

/*********************Cauchy********************/

void getCauchyParam(double mode, double median, double totalObs, vector<double> &param, vector<double> &step){

param.clear();

param.resize(2);

param[0] = (1.0/M_PI)*totalObs; // param a

param[1] = (mode+median)/2;// param b

step.clear();

step.resize(2);

step[0]=param[0]/10.0;//param a delta increment

step[1]=param[1]/10.0;//param b delta increment

}

double cauchy(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

return a/(1+pow(x-b,2.0));

}

/*********************Gaussian********************/

void getGaussianParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){

double stdDev = sqrt(variance);//the famous gaussian,estimated with mean and variance

param.clear();

param.resize(3);

param[0] = (1.0/(sqrt(2.0*M_PI)*stdDev)) * totalObs; // param a

param[1] = mean;// param b

param[2] = 2*variance;// param c

step.clear();

step.resize(3);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

step[1]=param[1]/10.0;//param b delta increment

step[2]=param[2]/10.0;//param c delta increment

}

double gaussian(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

double c = param[2];

return a*pow(M_E,-(pow(x-b,2)/c));

}

/*********************Logistic********************/

void getLogisticParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){

double s = sqrt((3*variance)/pow(M_PI,2.0));//s is derived from the traditional logistic formula

param.clear();

param.resize(2);

param[0] = (1.0/s)*totalObs; // param a

param[1] = s;// param b

step.clear();

step.resize(2);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

step[1]=param[1]/10.0;//param b delta increment

}

double logistic(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

return a*pow(M_E,-x/b)/(pow(1+pow(M_E,-x/b),2));

}

/*********************Weibull********************/

void getWeibullParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){

param.clear();

param.resize(3);

//found using data plot on excel, data looks like a weibull or rayleigh,

//but rayleigh can never meet goodness of fit criteria because of its constraints

param[0]=18; //param a

param[1]=1.5; //param b

param[2]=400; //param c

step.clear();

step.resize(3);

step[0]=param[0]/10.0;//param a delta increment, initial search granularity

step[1]=param[1]/10.0;//param b delta increment

step[2]=param[2]/10.0;//param c delta increment

}

double weibull(double x, const vector<double> &param){

double a = param[0];

double b = param[1];

double c = param[2];

double result=a*pow(x, b-1.0)*pow(M_E,-pow(x/c,b));

return result;

}

/***************calculations and algorithm for finding the min Mean square error***********/

//calculate the mean square error for all datapoints

double meanSquareError(double (*dist)(double,const vector<double>&),const vector<double> &datapoints,vector<double> &params){

double sum = 0.0;

//for each datapoint, run the get mean square error and sum it up, potentially sum could overflow,

//I catch this error in the calling method,

//alternatively we could have implemented a 128 bit struct{double double bool} to contain sum and indicate if it overflows

for(unsigned int i=0; i<datapoints.size(); i++){

double temp = pow((dist(i,params) - datapoints[i]), 2.0);

sum += temp;

}

return sum;

}

//get the nearest neighbours of the parameters

void getPaths(vector<vector<double> > &paths, vector<double> params, vector<double> steps){

int numOfParam = params.size();//number of axis

int numOfNeighbours = int(pow(3.0, numOfParam));//num of neighbours inclusive of current location

vector<double> tempSteps = steps;

//for each neighbour

for(int i=0;i<numOfNeighbours;i++){

//compute the its location with regards to each axis

for(int j=0;j<numOfParam; j++){

int base = int(pow(3.0,j));

int baseHigh = int(pow(3.0,j+1));

tempSteps[j] = (((i%baseHigh)/base)-1)*steps[j];//scale it down to (-1,0,1) multiplied by its search granularity

paths[i][j] = params[j]+tempSteps[j];

}

//optimize the params to find the min mse, return the optimized parameters (params) and the corresponding mean square error (bestMse)

void findBest(double (*distribution)(double,const vector<double>&), const vector<double> &datapoints

, double &bestMse, vector<double> &params, vector<double> steps, int stepDownMax){

vector<vector<double> > paths;//neighbours of current param incl. of itself

paths.resize(int(pow(3.0,double(params.size()))));//num of paths (eg: param = a,b,c paths.size() = 27, param=a,b paths.size()=9)

for(unsigned int i=0;i<paths.size();i++){

paths[i] = vector<double>(params.size(),0.0);

}

int searchCount=0;

int maxSearch=1000; //do not search for best move more than this count

int stepDownCount = 0;

bool isStopSearch=0; //stop search flag

vector<double> bestMove = params;

bestMse=0; //the lowest mean square error achieved by the best move

int noMoveIndex=(paths.size()-1)/2; //current location index

while (!isStopSearch){

getPaths(paths,params,steps);//find all the neighbours in solution space

//for each path find the mse, store the best move

for(unsigned int i=0;i<paths.size();i++){

double mse=meanSquareError(distribution,datapoints,paths[i]);

if(i==0) bestMse=mse;

else if(mse<bestMse){

bestMove=paths[i]; // this is the best move

bestMse=mse; //this is the best mean square error

}

//if this current location is the best, then step down and search with increased granularity

if(bestMove==paths[noMoveIndex]){

for(unsigned int i=0;i<steps.size();i++){ steps[i]/=10;}

stepDownCount++;

}

params=bestMove; //repeat search with newest best move as the start point

searchCount++;

//stop search conditions

if(stepDownCount>=stepDownMax){isStopSearch = 1;}//max granularity reached

if(bestMse<=0.0){isStopSearch = 1;}//best mse overflowed, try some other path

if(searchCount>=maxSearch) {isStopSearch = 1;} //max amount of searches exceeded

}

int main(){

vector<string> filenames;

filenames.push_back("Data2.txt");

filenames.push_back("Data3.txt");

filenames.push_back("Data4.txt");

filenames.push_back("Data5.txt");

filenames.push_back("Data6.txt");

filenames.push_back("Data7.txt");

vector<double> datapoints, params,steps;

double mseTolerance = 1e5;

//for each data file

for (unsigned int i=0; i< filenames.size(); i++){

string datafile = filenames[i];

readInputFiles(datapoints, datafile);

//get the mean, variance, total observations, mode and median of the data points,

//could have consolidated all the discriptors into one method and thus one for loop

double totalObs = getTotalObs(datapoints); //find number of data points

double mean = getMean(datapoints);//find mean of data points

double variance = getVariance(datapoints, mean);//find variance of data points

double mode = getMode(datapoints);//find mode of data points

double median = getMedian(datapoints, totalObs);//find median of data points

int maxStepDown = 3;//increase granularity of search count

double bestMse;//lowest mean square error for a particular method

double (*distribution)(double, const vector<double>&); //function pointer to various distributions

/*test each distribution from least time consuming to most time consuming; ascending order of params*/

//try uniform distribuion

distribution = uniform;

getUniformParam(variance,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown); //find the best mse

if(bestMse<=mseTolerance){

cout<<datafile<<": Uniform distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try exponential distribution

distribution = exponential;

getExponentialParam(mean,variance,totalObs, params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Exponential distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try Rayleigh distribution

distribution = rayleigh;

getRayleighParam(mean,variance,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Rayleigh distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try Logistic distribution

distribution = logistic;

getLogisticParam(mean,variance,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Logistic distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try Cauchy distribution

distribution = cauchy;

getCauchyParam(mode,median,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Cauchy distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try Gaussian distribution

distribution = gaussian;

getGaussianParam(mean,variance,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Gaussian distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//try Weibull distribution

distribution = weibull;

getWeibullParam(mean,variance,totalObs,params,steps);

findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);

if(bestMse<=mseTolerance){

cout<<datafile<<": Weibull distribution with"<<showParams(params)<<" S="<<bestMse<<endl;

continue;

}

//did not find any distribution that fits

cout<<datafile<<": no distribution found :("<<endl;

}//next file

return 0;

}

Chee Tji Hun

Labels

Monday, March 14, 2011

Find the distribution of data points using method of moments

No comments:

Post a Comment

About Me

Blog Archive