Blog Stats
  • Posts - 99
  • Articles - 5
  • Comments - 43
  • Trackbacks - 108

 

Linear regression

Linear regression http://www.stat.yale.edu/Courses/1997-98/101/linreg.htm is often used in trying to generalize data, correlate data, and find outliers.

Something I had written some time ago in C# though it also includes VB.NET translation... It can easily be dropped into a datasethelper class and will regress two columns within a dataset returning major variables including Pearson's R http://davidmlane.com/hyperstat/A34739.html PLEASE NOTE THIS IS NOT PRODUCTION CODE AND IS HERE AS AN EXAMPLE.

 public class Regression
 {
  public class RegressionProcessInfo {
   public int SampleSize=0;
   public double SigmaError;
   public double XRangeL = double.MaxValue;
   public double XRangeH = double.MinValue;
   public double YRangeL = double.MaxValue;
   public double YRangeH = double.MinValue;
   public double StandardError;
   public double a;
   public double b;
   public double XStdDev;
   public double YStdDev;
   public double XMean;
   public double YMean;
   public double PearsonsR;
   public double t;
   ArrayList Residuals = new ArrayList();

   public string ToHTML() {
    string ret = "";
    ret += "y = " + a + " + " + b + "x" + "
";
    ret += "Sample size = " + SampleSize + "
";
    ret += "X Mean = " + XMean + "
";
    ret += "X Range = " + XRangeL + " - " + XRangeH + "
";
    ret += "X STDDEV = " + this.XStdDev + "
";
    ret += "Y Mean = " + YMean + "
";
    ret += "Y Range = " + YRangeL + " - " + YRangeH + "
";
    ret += "Y STDDEV = " + this.YStdDev + "
";
    ret += "t = " + this.t + "
";
    ret += "r = " + this.PearsonsR + "
";
    ret += "r² = " + this.PearsonsR * this.PearsonsR + "
";
    if(this.PearsonsR * this.PearsonsR < .25) {
     ret += "Low r² scores represent LOW correlation between variables !
";
    }
    ret += "
Total Error = " + this.SigmaError + "
";
    ret += "Standard Error sqrt((y - y')² / n); = " + this.StandardError + "
";
    if(this.StandardError > this.YStdDev) {
     ret += "THIS IS NOT A REASONABLE PREDICTION AS THE STANDARD ERROR IS LARGER THAN THE FIRST STANDARD DEVIATION OF THE Y VARIABLE";
    }
    return ret;
   }
  }

  public static RegressionProcessInfo Regress(DataSet ds, string XColumn, string YColumn) {
   double sigmax = 0.0;
   double sigmay = 0.0;
   double sigmaxx = 0.0;
   double sigmayy = 0.0;
   double sigmaxy = 0.0;
   double x;
   double y;
   double n = 0;

   RegressionProcessInfo ret = new RegressionProcessInfo();

   if(ds != null) {
    foreach(DataRow dr in ds.Tables[0].Rows) {
     x = double.Parse(dr[XColumn].ToString());
     y = double.Parse(dr[YColumn].ToString());
     if(x > ret.XRangeH) ret.XRangeH = x;
     if(x < ret.XRangeL) ret.XRangeL = x;
     if(y > ret.YRangeH) ret.YRangeH = y;
     if(y < ret.YRangeL) ret.YRangeL = y;
     sigmax += x;
     sigmaxx += x * x;
     sigmay += y;
     sigmayy += y * y;
     sigmaxy += x * y;
     n++;
    }
    ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax);
    ret.a = (sigmay - ret.b * sigmax) / n;
    ret.SampleSize = (int) n;
    
    foreach(DataRow dr in ds.Tables[0].Rows) {
     //calculate distances for each point (residual)
     y = double.Parse(dr[YColumn].ToString());
     x = double.Parse(dr[XColumn].ToString());
     double yprime = ret.a + ret.b * x; //prediction
     double Residual = y - yprime;
     ret.SigmaError += Residual * Residual;
    }
    ret.XMean = sigmax / n;
    ret.YMean = sigmay / n;
    ret.XStdDev = Math.Sqrt(((double)n * sigmaxx - sigmax * sigmax) /((double)n * (double)n - 1.0));
    ret.YStdDev = Math.Sqrt(((double)n * sigmayy - sigmay * sigmay) /((double)n * (double)n - 1.0));
    ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize);
    double ssx = sigmaxx-((sigmax*sigmax)/n);
    double ssy = sigmayy-((sigmay*sigmay)/n);
    double ssxy = sigmaxy-((sigmax*sigmay)/n);
    ret.PearsonsR = ssxy / Math.Sqrt( ssx * ssy);
    ret.t = ret.PearsonsR / Math.Sqrt( (1-(ret.PearsonsR * ret.PearsonsR))/(n-2));
    

   }
   return ret;
  }
 }

 

 

for those who need it in VB.NET

Public Class Regression

Public Class RegressionProcessInfo

Public SampleSize As Integer = 0

Public SigmaError As Double

Public XRangeL As Double = Double.MaxValue

Public XRangeH As Double = Double.MinValue

Public YRangeL As Double = Double.MaxValue

Public YRangeH As Double = Double.MinValue

Public StandardError As Double

Public a As Double

Public b As Double

Public XStdDev As Double

Public YStdDev As Double

Public XMean As Double

Public YMean As Double

Public PearsonsR As Double

Public t As Double

Dim Residuals As ArrayList = New ArrayList

Public Overrides Function ToString() As String

Dim ret As String = "SampleSize=" & Me.SampleSize & vbCrLf & _

"StandardError=" & Me.StandardError & vbCrLf & _

"y=" & Me.a & " + " & Me.b & "x"

Return ret

End Function

End Class

Function Regress(ByVal xval() As Double, ByVal yval() As Double)

Dim sigmax As Double = 0.0

Dim sigmay As Double = 0.0

Dim sigmaxx As Double = 0.0

Dim sigmayy As Double = 0.0

Dim sigmaxy As Double = 0.0

Dim x As Double

Dim y As Double

Dim n As Double = 0

Dim ret As RegressionProcessInfo = New RegressionProcessInfo

For arrayitem As Integer = LBound(xval) To UBound(xval)

x = xval(arrayitem)

y = yval(arrayitem)

If x > ret.XRangeH Then

ret.XRangeH = x

End If

If x < ret.XRangeL Then

ret.XRangeL = x

End If

If y > ret.YRangeH Then

ret.YRangeH = y

End If

If y < ret.YRangeL Then

ret.YRangeL = y

End If

sigmax += x

sigmaxx += x * x

sigmay += y

sigmayy += y * y

sigmaxy += x * y

n = n + 1

Next

ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax)

ret.a = (sigmay - ret.b * sigmax) / n

ret.SampleSize = CType(n, Integer)

'calculate distances for each point (residual)

For arr2 As Integer = LBound(xval) To UBound(xval)

y = yval(arr2)

x = xval(arr2)

Dim yprime As Double = ret.a + ret.b * x 'prediction

Dim Residual As Double = y - yprime

ret.SigmaError += Residual * Residual

Next

ret.XMean = sigmax / n

ret.YMean = sigmay / n

ret.XStdDev = Math.Sqrt((CType(n * sigmaxx - sigmax * sigmax, Double)) / (CDbl(n) * CDbl(n) - 1.0))

ret.YStdDev = Math.Sqrt((CType(n * sigmayy - sigmay * sigmay, Double)) / (CDbl(n) * CDbl(n) - 1.0))

ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize)

Dim ssx As Double = sigmaxx - ((sigmax * sigmax) / n)

Dim ssy As Double = sigmayy - ((sigmay * sigmay) / n)

Dim ssxy As Double = sigmaxy - ((sigmax * sigmay) / n)

ret.PearsonsR = ssxy / Math.Sqrt(ssx * ssy)

ret.t = ret.PearsonsR / Math.Sqrt((1 - (ret.PearsonsR * ret.PearsonsR)) / (n - 2))

Return ret

End Function

End Class


Feedback

# re: Linear regression

Gravatar Hi, thanks for this, I'm trying to use this code to test a curve i want to be able to identify 1 or more points which are out of step with the rest. don't suppose you have any samples 7/25/2007 5:54 AM | Duncan

# re: Linear regression

Gravatar When you say "out of step" what do you mean by out of step? Generally the way of doing this would be to plot a curve using something like Levenberg-Marquardt http://en.wikipedia.org/wiki/Levenberg-Marquardt_algorithm then you would just look at what you prediction was compared with what the actual data was (when they are far away you have identified such a position).

Most curve fitting is trying to minimize the sum of the squares of the differences between your prediction and the actual data ... so just find the squares that are the largest.

If you are looking for a simpler method you may was to just use things like standard deviation. 7/25/2007 12:18 PM | Greg Young

Post a comment





 

Please add 3 and 6 and type the answer here:

 

 

Copyright © Greg Young