At the time I chose to use the method of moments, but given what I know now, maximum likelihood might have fared better (a small sketch contrasting the two follows the list below).
The candidate distributions are:
Uniform
Rayleigh
Cauchy
Exponential
Gaussian
Logistic
Weibull
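For illustration only, here is a minimal sketch (not part of the assignment code, and with made-up sample values) contrasting the two estimators for the Rayleigh distribution, one of the candidates above:
#include <cmath>
#include <iostream>
#include <vector>
using namespace std;
int main(){
const double PI = 3.141592653589793;
vector<double> x; //hypothetical raw samples
x.push_back(1.2); x.push_back(0.7); x.push_back(2.3);
x.push_back(1.9); x.push_back(0.4); x.push_back(1.1);
double n = double(x.size()), sum = 0.0, sumSq = 0.0;
for(unsigned int i=0;i<x.size();i++){ sum += x[i]; sumSq += x[i]*x[i]; }
double mean = sum/n;
double sigmaMoM = mean*sqrt(2.0/PI); //method of moments: E[X] = sigma*sqrt(pi/2)
double sigmaMLE = sqrt(sumSq/(2.0*n)); //maximum likelihood: sigma^2 = sum(x_i^2)/(2n)
cout<<"MoM sigma="<<sigmaMoM<<", MLE sigma="<<sigmaMLE<<endl;
return 0;
}
For the Rayleigh the two estimates generally differ; for the exponential and the Gaussian they happen to coincide with the moment estimates, so the choice matters more for some candidates than for others.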
The code is below:
/***************Assignment 2 OOP1, author: Chee Tji Hun***********/
#define _USE_MATH_DEFINES //must come before <cmath>/<math.h> so that M_PI and M_E are defined on MSVC
#include <iostream>
#include <fstream>
#include <cmath>
#include <math.h>
#include <vector>
#include <string>
#include <sstream>
using namespace std;
/***************Input/output methods***********/
//return parameters as string in terms of a,b,c
string showParams(vector<double> params){
char paramName = 'a';
string returnString;
//print in the order a,b,c,... stopping when all params have been printed
for(unsigned int i=0;i<params.size();i++){
string s_ParamName(1,char(paramName+i));
stringstream s;
s<<params[i];
returnString+=" "+s_ParamName+"="+s.str()+",";
}
return returnString;
}
//read the input file and bin its values into datapoints (a histogram of counts per integer bin)
void readInputFiles(vector<double> &datapoints, string filename){
datapoints.clear();
datapoints.resize(1000);
//set vector elements to 0
for(unsigned int i=0; i<datapoints.size(); i++){datapoints[i]=0;}
double temp=0;
int index = 0;
ifstream in(filename.c_str());
//read double into the vector, find its bin and increment bin count
while(in>>temp){
index = int(temp);
if(index>=0 && index<int(datapoints.size())){datapoints[index]++;}//guard against values outside the bin range
}
}
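//Example of the representation built above (hypothetical input): a file containing 2.7, 2.1 and 5.9
//would give datapoints[2]==2 and datapoints[5]==1, with every other bin left at 0;
//all the descriptors below operate on these bin counts rather than on the raw samples.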
/***************Descriptors of the data points***********/
//return total count of datapoints
double getTotalObs(vector<double> &datapoints){
double totalObs=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
totalObs+=datapoints[i];
}
return totalObs;
}
//return the mean of the datapoints
double getMean(const vector<double> &datapoints){
double numOfOccurence=0;
double total=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
total +=(i*datapoints[i]);
numOfOccurence+=datapoints[i];
}
return total/numOfOccurence;
}
//return the variance of the datapoints
double getVariance(const vector<double> &datapoints, double mean){
double numOfOccurence=0;
double total=0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]==0){continue;}
total +=(i*i*datapoints[i]);
numOfOccurence+=datapoints[i];
}
return total/numOfOccurence - pow(mean,2.0);
}
//return the mode of the datapoints
double getMode(const vector<double> &datapoints){
double biggestBin=-1;
double mode=-1.0;
for(unsigned int i=0;i<datapoints.size();i++){
if(datapoints[i]>biggestBin){
biggestBin=datapoints[i];
mode=double(i);
}
}
return mode;
}
//return the median of the datapoints
double getMedian(const vector<double> &datapoints, double totalObs){
double halfWayPoint = totalObs/2.0;
double median = -1.0;
for(unsigned int i=0;i<datapoints.size();i++){
halfWayPoint-=datapoints[i];
if(halfWayPoint<=0){
median=double(i);
break;
}
}
return median;
}
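//Worked example of the descriptors (hypothetical bins): with datapoints[2]=3 and datapoints[5]=1,
//totalObs = 4, mean = (2*3 + 5*1)/4 = 2.75, variance = (4*3 + 25*1)/4 - 2.75^2 = 1.6875,
//mode = 2 (the largest bin) and median = 2 (the bin where the cumulative count passes totalObs/2).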
/***************Various distributions and their respective a,b,c estimation methods***********/
/***************ALL the parameters are estimated using method of moments***********/
/*********************Uniform********************/
void getUniformParam(double variance, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(1);
param[0] = (1.0/sqrt(variance*12)) * totalObs; //param a, the height of the uniform pdf scaled by totalObs since the histogram is not normalized
step.clear();
step.resize(1);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
}
double uniform(double x, const vector<double> &param){
double a = param[0];
return a;
}
/*********************Rayleigh********************/
void getRayleighParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double sigma1 = mean*sqrt((2.0/M_PI)); //the Rayleigh pdf contains sigma, which can be estimated in two ways (from the mean and from the variance); take the average
double sigma2= sqrt(variance*(2.0/(4-M_PI)));
double avgSigma = (sigma1+sigma2)/2.0;
param.clear();
param.resize(2);
param[0] = (1/(avgSigma*avgSigma))*totalObs; //param a, rayleigh formula with scaling factor
param[1] = 2*avgSigma*avgSigma;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}
double rayleigh(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*x*pow(M_E,-(pow(x,2.0)/b));
}
/*********************Exponential********************/
void getExponentialParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double lambda1 = 1.0/mean;//the exponential pdf contains lambda, which can be estimated in two ways (from the mean and from the variance); take the average
double lambda2 = sqrt(1.0/variance);
double avgLambda = (lambda1+lambda2)/2.0;
param.clear();
param.resize(2);
param[0] = avgLambda*totalObs; // param a, exponential formula with scaling factor
param[1] = 1.0/avgLambda;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}
double exponential(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*pow(M_E,-x/b);
}
/*********************Cauchy********************/
void getCauchyParam(double mode, double median, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(2);
param[0] = (1.0/M_PI)*totalObs; // param a
param[1] = (mode+median)/2;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment
step[1]=param[1]/10.0;//param b delta increment
}
double cauchy(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a/(1+pow(x-b,2.0));
}
/*********************Gaussian********************/
void getGaussianParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double stdDev = sqrt(variance);//the familiar Gaussian, estimated from the mean and variance
param.clear();
param.resize(3);
param[0] = (1.0/(sqrt(2.0*M_PI)*stdDev)) * totalObs; // param a
param[1] = mean;// param b
param[2] = 2*variance;// param c
step.clear();
step.resize(3);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
step[2]=param[2]/10.0;//param c delta increment
}
double gaussian(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
double c = param[2];
return a*pow(M_E,-(pow(x-b,2)/c));
}
/*********************Logistic********************/
void getLogisticParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
double s = sqrt((3*variance)/pow(M_PI,2.0));//s is derived from the traditional logistic formula
param.clear();
param.resize(2);
param[0] = (1.0/s)*totalObs; // param a
param[1] = s;// param b
step.clear();
step.resize(2);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
}
double logistic(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
return a*pow(M_E,-x/b)/(pow(1+pow(M_E,-x/b),2));
}
/*********************Weibull********************/
void getWeibullParam(double mean, double variance, double totalObs, vector<double> &param, vector<double> &step){
param.clear();
param.resize(3);
//initial values found by plotting the data in Excel; the data looks like a Weibull or a Rayleigh,
//but a Rayleigh can never meet the goodness-of-fit criterion because of its constraints
param[0]=18; //param a
param[1]=1.5; //param b
param[2]=400; //param c
step.clear();
step.resize(3);
step[0]=param[0]/10.0;//param a delta increment, initial search granularity
step[1]=param[1]/10.0;//param b delta increment
step[2]=param[2]/10.0;//param c delta increment
}
double weibull(double x, const vector<double> &param){
double a = param[0];
double b = param[1];
double c = param[2];
double result=a*pow(x, b-1.0)*pow(M_E,-pow(x/c,b));
return result;
}
/***************calculations and algorithm for finding the min Mean square error***********/
//calculate the squared error between the fitted curve and the bin counts, summed over all bins (the S reported in the output)
double meanSquareError(double (*dist)(double,const vector<double>&),const vector<double> &datapoints,vector<double> &params){
double sum = 0.0;
//for each bin, compute the squared error and add it to the sum; the sum could potentially overflow,
//which is caught in the calling method;
//alternatively, a 128-bit struct{double,double,bool} could hold the sum and flag an overflow
for(unsigned int i=0; i<datapoints.size(); i++){
double temp = pow((dist(i,params) - datapoints[i]), 2.0);
sum += temp;
}
return sum;
}
//get the nearest neighbours of the parameters
void getPaths(vector<vector<double> > &paths, vector<double> params, vector<double> steps){
int numOfParam = params.size();//number of axis
int numOfNeighbours = int(pow(3.0, numOfParam));//num of neighbours inclusive of current location
vector<double> tempSteps = steps;
//for each neighbour
for(int i=0;i<numOfNeighbours;i++){
//compute its location along each axis
for(int j=0;j<numOfParam; j++){
int base = int(pow(3.0,j));
int baseHigh = int(pow(3.0,j+1));
tempSteps[j] = (((i%baseHigh)/base)-1)*steps[j];//scale it down to (-1,0,1) multiplied by its search granularity
paths[i][j] = params[j]+tempSteps[j];
}
}
}
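//Example of the neighbour enumeration above (hypothetical): with params={a,b} and steps={1,0.1},
//numOfNeighbours is 9 and the base-3 digits of i map each axis to an offset of -1, 0 or +1 steps,
//giving (a-1,b-0.1), (a,b-0.1), (a+1,b-0.1), (a-1,b), (a,b), (a+1,b), (a-1,b+0.1), (a,b+0.1), (a+1,b+0.1);
//the centre (a,b) sits at index (9-1)/2 = 4, which findBest uses as noMoveIndex.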
//optimize the params to find the min mse, return the optimized parameters (params) and the corresponding mean square error (bestMse)
void findBest(double (*distribution)(double,const vector<double>&), const vector<double> &datapoints
, double &bestMse, vector<double> &params, vector<double> steps, int stepDownMax){
vector<vector<double> > paths;//neighbours of current param incl. of itself
paths.resize(int(pow(3.0,double(params.size()))));//num of paths (eg: param = a,b,c paths.size() = 27, param=a,b paths.size()=9)
for(unsigned int i=0;i<paths.size();i++){
paths[i] = vector<double>(params.size(),0.0);
}
int searchCount=0;
int maxSearch=1000; //do not search for best move more than this count
int stepDownCount = 0;
bool isStopSearch=0; //stop search flag
vector<double> bestMove = params;
bestMse=0; //the lowest mean square error achieved by the best move
int noMoveIndex=(paths.size()-1)/2; //current location index
while (!isStopSearch){
getPaths(paths,params,steps);//find all the neighbours in solution space
//for each path find the mse, store the best move
for(unsigned int i=0;i<paths.size();i++){
double mse=meanSquareError(distribution,datapoints,paths[i]);
if(i==0){bestMse=mse; bestMove=paths[i];}//initialise the best move with the first path
else if(mse<bestMse){
bestMove=paths[i]; // this is the best move
bestMse=mse; //this is the best mean square error
}
}
//if this current location is the best, then step down and search with increased granularity
if(bestMove==paths[noMoveIndex]){
for(unsigned int i=0;i<steps.size();i++){ steps[i]/=10;}
stepDownCount++;
}
params=bestMove; //repeat search with newest best move as the start point
searchCount++;
//stop search conditions
if(stepDownCount>=stepDownMax){isStopSearch = 1;}//max granularity reached
if(bestMse<=0.0){isStopSearch = 1;}//best mse overflowed, try some other path
if(searchCount>=maxSearch) {isStopSearch = 1;} //max amount of searches exceeded
}
}
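//In short, findBest is a pattern search (hill climbing on the squared-error surface):
//evaluate all 3^n neighbours of the current parameters, move to the best one, and when the
//centre itself is best, divide every step by 10 and continue, up to stepDownMax refinements,
//stopping early if maxSearch iterations are exceeded or the error reaches zero.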
int main(){
vector<string> filenames;
filenames.push_back("Data2.txt");
filenames.push_back("Data3.txt");
filenames.push_back("Data4.txt");
filenames.push_back("Data5.txt");
filenames.push_back("Data6.txt");
filenames.push_back("Data7.txt");
vector<double> datapoints, params,steps;
double mseTolerance = 1e5;
//for each data file
for (unsigned int i=0; i< filenames.size(); i++){
string datafile = filenames[i];
readInputFiles(datapoints, datafile);
//get the mean, variance, total observations, mode and median of the data points,
//could have consolidated all the descriptors into one method and thus one for loop
double totalObs = getTotalObs(datapoints); //find number of data points
double mean = getMean(datapoints);//find mean of data points
double variance = getVariance(datapoints, mean);//find variance of data points
double mode = getMode(datapoints);//find mode of data points
double median = getMedian(datapoints, totalObs);//find median of data points
int maxStepDown = 3;//maximum number of times the search granularity is refined
double bestMse;//lowest mean square error for a particular method
double (*distribution)(double, const vector<double>&); //function pointer to various distributions
/*test each distribution from least time consuming to most time consuming; ascending order of params*/
//try uniform distribution
distribution = uniform;
getUniformParam(variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown); //find the best mse
if(bestMse<=mseTolerance){
cout<<datafile<<": Uniform distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try exponential distribution
distribution = exponential;
getExponentialParam(mean,variance,totalObs, params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Exponential distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try Rayleigh distribution
distribution = rayleigh;
getRayleighParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Rayleigh distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try Logistic distribution
distribution = logistic;
getLogisticParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Logistic distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try Cauchy distribution
distribution = cauchy;
getCauchyParam(mode,median,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Cauchy distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try Gaussian distribution
distribution = gaussian;
getGaussianParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Gaussian distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//try Weibull distribution
distribution = weibull;
getWeibullParam(mean,variance,totalObs,params,steps);
findBest((*distribution),datapoints,bestMse,params,steps,maxStepDown);
if(bestMse<=mseTolerance){
cout<<datafile<<": Weibull distribution with"<<showParams(params)<<" S="<<bestMse<<endl;
continue;
}
//did not find any distribution that fits
cout<<datafile<<": no distribution found :("<<endl;
}//next file
return 0;
}