Linear regression http://www.stat.yale.edu/Courses/1997-98/101/linreg.htm is often used in trying to generalize data, correlate data, and find outliers.
Something I had written some time ago in C# though it also includes VB.NET translation... It can easily be dropped into a datasethelper class and will regress two columns within a dataset returning major variables including Pearson's R http://davidmlane.com/hyperstat/A34739.html PLEASE NOTE THIS IS NOT PRODUCTION CODE AND IS HERE AS AN EXAMPLE.
public class Regression
{
public class RegressionProcessInfo {
public int SampleSize=0;
public double SigmaError;
public double XRangeL = double.MaxValue;
public double XRangeH = double.MinValue;
public double YRangeL = double.MaxValue;
public double YRangeH = double.MinValue;
public double StandardError;
public double a;
public double b;
public double XStdDev;
public double YStdDev;
public double XMean;
public double YMean;
public double PearsonsR;
public double t;
ArrayList Residuals = new ArrayList();
public string ToHTML() {
string ret = "";
ret += "y = " + a + " + " + b + "x" + "
";
ret += "Sample size = " + SampleSize + "
";
ret += "X Mean = " + XMean + "
";
ret += "X Range = " + XRangeL + " - " + XRangeH + "
";
ret += "X STDDEV = " + this.XStdDev + "
";
ret += "Y Mean = " + YMean + "
";
ret += "Y Range = " + YRangeL + " - " + YRangeH + "
";
ret += "Y STDDEV = " + this.YStdDev + "
";
ret += "t = " + this.t + "
";
ret += "r = " + this.PearsonsR + "
";
ret += "r² = " + this.PearsonsR * this.PearsonsR + "
";
if(this.PearsonsR * this.PearsonsR < .25) {
ret += "Low r² scores represent LOW correlation between variables !
";
}
ret += "
Total Error = " + this.SigmaError + "
";
ret += "Standard Error sqrt((y - y')² / n); = " + this.StandardError + "
";
if(this.StandardError > this.YStdDev) {
ret += "THIS IS NOT A REASONABLE PREDICTION AS THE STANDARD ERROR IS LARGER THAN THE FIRST STANDARD DEVIATION OF THE Y VARIABLE";
}
return ret;
}
}
public static RegressionProcessInfo Regress(DataSet ds, string XColumn, string YColumn) {
double sigmax = 0.0;
double sigmay = 0.0;
double sigmaxx = 0.0;
double sigmayy = 0.0;
double sigmaxy = 0.0;
double x;
double y;
double n = 0;
RegressionProcessInfo ret = new RegressionProcessInfo();
if(ds != null) {
foreach(DataRow dr in ds.Tables[0].Rows) {
x = double.Parse(dr[XColumn].ToString());
y = double.Parse(dr[YColumn].ToString());
if(x > ret.XRangeH) ret.XRangeH = x;
if(x < ret.XRangeL) ret.XRangeL = x;
if(y > ret.YRangeH) ret.YRangeH = y;
if(y < ret.YRangeL) ret.YRangeL = y;
sigmax += x;
sigmaxx += x * x;
sigmay += y;
sigmayy += y * y;
sigmaxy += x * y;
n++;
}
ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax);
ret.a = (sigmay - ret.b * sigmax) / n;
ret.SampleSize = (int) n;
foreach(DataRow dr in ds.Tables[0].Rows) {
//calculate distances for each point (residual)
y = double.Parse(dr[YColumn].ToString());
x = double.Parse(dr[XColumn].ToString());
double yprime = ret.a + ret.b * x; //prediction
double Residual = y - yprime;
ret.SigmaError += Residual * Residual;
}
ret.XMean = sigmax / n;
ret.YMean = sigmay / n;
ret.XStdDev = Math.Sqrt(((double)n * sigmaxx - sigmax * sigmax) /((double)n * (double)n - 1.0));
ret.YStdDev = Math.Sqrt(((double)n * sigmayy - sigmay * sigmay) /((double)n * (double)n - 1.0));
ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize);
double ssx = sigmaxx-((sigmax*sigmax)/n);
double ssy = sigmayy-((sigmay*sigmay)/n);
double ssxy = sigmaxy-((sigmax*sigmay)/n);
ret.PearsonsR = ssxy / Math.Sqrt( ssx * ssy);
ret.t = ret.PearsonsR / Math.Sqrt( (1-(ret.PearsonsR * ret.PearsonsR))/(n-2));
}
return ret;
}
}
for those who need it in VB.NET
Public Class Regression
Public Class RegressionProcessInfo
Public SampleSize As Integer = 0
Public SigmaError As Double
Public XRangeL As Double = Double.MaxValue
Public XRangeH As Double = Double.MinValue
Public YRangeL As Double = Double.MaxValue
Public YRangeH As Double = Double.MinValue
Public StandardError As Double
Public a As Double
Public b As Double
Public XStdDev As Double
Public YStdDev As Double
Public XMean As Double
Public YMean As Double
Public PearsonsR As Double
Public t As Double
Dim Residuals As ArrayList = New ArrayList
Public Overrides Function ToString() As String
Dim ret As String = "SampleSize=" & Me.SampleSize & vbCrLf & _
"StandardError=" &
Me.StandardError & vbCrLf & _
"y=" &
Me.a & " + " & Me.b & "x"
Return ret
End Function
End Class
Function Regress(ByVal xval() As Double, ByVal yval() As Double)
Dim sigmax As Double = 0.0
Dim sigmay As Double = 0.0
Dim sigmaxx As Double = 0.0
Dim sigmayy As Double = 0.0
Dim sigmaxy As Double = 0.0
Dim x As Double
Dim y As Double
Dim n As Double = 0
Dim ret As RegressionProcessInfo = New RegressionProcessInfo
For arrayitem As Integer = LBound(xval) To UBound(xval)
x = xval(arrayitem)
y = yval(arrayitem)
If x > ret.XRangeH Then
ret.XRangeH = x
End If
If x < ret.XRangeL Then
ret.XRangeL = x
End If
If y > ret.YRangeH Then
ret.YRangeH = y
End If
If y < ret.YRangeL Then
ret.YRangeL = y
End If
sigmax += x
sigmaxx += x * x
sigmay += y
sigmayy += y * y
sigmaxy += x * y
n = n + 1
Next
ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax)
ret.a = (sigmay - ret.b * sigmax) / n
ret.SampleSize =
CType(n, Integer)
'calculate distances for each point (residual)
For arr2 As Integer = LBound(xval) To UBound(xval)
y = yval(arr2)
x = xval(arr2)
Dim yprime As Double = ret.a + ret.b * x 'prediction
Dim Residual As Double = y - yprime
ret.SigmaError += Residual * Residual
Next
ret.XMean = sigmax / n
ret.YMean = sigmay / n
ret.XStdDev = Math.Sqrt((
CType(n * sigmaxx - sigmax * sigmax, Double)) / (CDbl(n) * CDbl(n) - 1.0))
ret.YStdDev = Math.Sqrt((
CType(n * sigmayy - sigmay * sigmay, Double)) / (CDbl(n) * CDbl(n) - 1.0))
ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize)
Dim ssx As Double = sigmaxx - ((sigmax * sigmax) / n)
Dim ssy As Double = sigmayy - ((sigmay * sigmay) / n)
Dim ssxy As Double = sigmaxy - ((sigmax * sigmay) / n)
ret.PearsonsR = ssxy / Math.Sqrt(ssx * ssy)
ret.t = ret.PearsonsR / Math.Sqrt((1 - (ret.PearsonsR * ret.PearsonsR)) / (n - 2))
Return ret
End Function
End Class