Geeks With Blogs
Greg Young Greg.ToString()

Linear regression http://www.stat.yale.edu/Courses/1997-98/101/linreg.htm is often used in trying to generalize data, correlate data, and find outliers.

Something I had written some time ago in C# though it also includes VB.NET translation... It can easily be dropped into a datasethelper class and will regress two columns within a dataset returning major variables including Pearson's R http://davidmlane.com/hyperstat/A34739.html PLEASE NOTE THIS IS NOT PRODUCTION CODE AND IS HERE AS AN EXAMPLE.

 public class Regression
 {
  public class RegressionProcessInfo {
   public int SampleSize=0;
   public double SigmaError;
   public double XRangeL = double.MaxValue;
   public double XRangeH = double.MinValue;
   public double YRangeL = double.MaxValue;
   public double YRangeH = double.MinValue;
   public double StandardError;
   public double a;
   public double b;
   public double XStdDev;
   public double YStdDev;
   public double XMean;
   public double YMean;
   public double PearsonsR;
   public double t;
   ArrayList Residuals = new ArrayList();

   public string ToHTML() {
    string ret = "";
    ret += "y = " + a + " + " + b + "x" + "
";
    ret += "Sample size = " + SampleSize + "
";
    ret += "X Mean = " + XMean + "
";
    ret += "X Range = " + XRangeL + " - " + XRangeH + "
";
    ret += "X STDDEV = " + this.XStdDev + "
";
    ret += "Y Mean = " + YMean + "
";
    ret += "Y Range = " + YRangeL + " - " + YRangeH + "
";
    ret += "Y STDDEV = " + this.YStdDev + "
";
    ret += "t = " + this.t + "
";
    ret += "r = " + this.PearsonsR + "
";
    ret += "r² = " + this.PearsonsR * this.PearsonsR + "
";
    if(this.PearsonsR * this.PearsonsR < .25) {
     ret += "Low r² scores represent LOW correlation between variables !
";
    }
    ret += "
Total Error = " + this.SigmaError + "
";
    ret += "Standard Error sqrt((y - y')² / n); = " + this.StandardError + "
";
    if(this.StandardError > this.YStdDev) {
     ret += "THIS IS NOT A REASONABLE PREDICTION AS THE STANDARD ERROR IS LARGER THAN THE FIRST STANDARD DEVIATION OF THE Y VARIABLE";
    }
    return ret;
   }
  }

  public static RegressionProcessInfo Regress(DataSet ds, string XColumn, string YColumn) {
   double sigmax = 0.0;
   double sigmay = 0.0;
   double sigmaxx = 0.0;
   double sigmayy = 0.0;
   double sigmaxy = 0.0;
   double x;
   double y;
   double n = 0;

   RegressionProcessInfo ret = new RegressionProcessInfo();

   if(ds != null) {
    foreach(DataRow dr in ds.Tables[0].Rows) {
     x = double.Parse(dr[XColumn].ToString());
     y = double.Parse(dr[YColumn].ToString());
     if(x > ret.XRangeH) ret.XRangeH = x;
     if(x < ret.XRangeL) ret.XRangeL = x;
     if(y > ret.YRangeH) ret.YRangeH = y;
     if(y < ret.YRangeL) ret.YRangeL = y;
     sigmax += x;
     sigmaxx += x * x;
     sigmay += y;
     sigmayy += y * y;
     sigmaxy += x * y;
     n++;
    }
    ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax);
    ret.a = (sigmay - ret.b * sigmax) / n;
    ret.SampleSize = (int) n;
    
    foreach(DataRow dr in ds.Tables[0].Rows) {
     //calculate distances for each point (residual)
     y = double.Parse(dr[YColumn].ToString());
     x = double.Parse(dr[XColumn].ToString());
     double yprime = ret.a + ret.b * x; //prediction
     double Residual = y - yprime;
     ret.SigmaError += Residual * Residual;
    }
    ret.XMean = sigmax / n;
    ret.YMean = sigmay / n;
    ret.XStdDev = Math.Sqrt(((double)n * sigmaxx - sigmax * sigmax) /((double)n * (double)n - 1.0));
    ret.YStdDev = Math.Sqrt(((double)n * sigmayy - sigmay * sigmay) /((double)n * (double)n - 1.0));
    ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize);
    double ssx = sigmaxx-((sigmax*sigmax)/n);
    double ssy = sigmayy-((sigmay*sigmay)/n);
    double ssxy = sigmaxy-((sigmax*sigmay)/n);
    ret.PearsonsR = ssxy / Math.Sqrt( ssx * ssy);
    ret.t = ret.PearsonsR / Math.Sqrt( (1-(ret.PearsonsR * ret.PearsonsR))/(n-2));
    

   }
   return ret;
  }
 }

 

 

for those who need it in VB.NET

Public Class Regression

Public Class RegressionProcessInfo

Public SampleSize As Integer = 0

Public SigmaError As Double

Public XRangeL As Double = Double.MaxValue

Public XRangeH As Double = Double.MinValue

Public YRangeL As Double = Double.MaxValue

Public YRangeH As Double = Double.MinValue

Public StandardError As Double

Public a As Double

Public b As Double

Public XStdDev As Double

Public YStdDev As Double

Public XMean As Double

Public YMean As Double

Public PearsonsR As Double

Public t As Double

Dim Residuals As ArrayList = New ArrayList

Public Overrides Function ToString() As String

Dim ret As String = "SampleSize=" & Me.SampleSize & vbCrLf & _

"StandardError=" & Me.StandardError & vbCrLf & _

"y=" & Me.a & " + " & Me.b & "x"

Return ret

End Function

End Class

Function Regress(ByVal xval() As Double, ByVal yval() As Double)

Dim sigmax As Double = 0.0

Dim sigmay As Double = 0.0

Dim sigmaxx As Double = 0.0

Dim sigmayy As Double = 0.0

Dim sigmaxy As Double = 0.0

Dim x As Double

Dim y As Double

Dim n As Double = 0

Dim ret As RegressionProcessInfo = New RegressionProcessInfo

For arrayitem As Integer = LBound(xval) To UBound(xval)

x = xval(arrayitem)

y = yval(arrayitem)

If x > ret.XRangeH Then

ret.XRangeH = x

End If

If x < ret.XRangeL Then

ret.XRangeL = x

End If

If y > ret.YRangeH Then

ret.YRangeH = y

End If

If y < ret.YRangeL Then

ret.YRangeL = y

End If

sigmax += x

sigmaxx += x * x

sigmay += y

sigmayy += y * y

sigmaxy += x * y

n = n + 1

Next

ret.b = (n * sigmaxy - sigmax * sigmay) / (n * sigmaxx - sigmax * sigmax)

ret.a = (sigmay - ret.b * sigmax) / n

ret.SampleSize = CType(n, Integer)

'calculate distances for each point (residual)

For arr2 As Integer = LBound(xval) To UBound(xval)

y = yval(arr2)

x = xval(arr2)

Dim yprime As Double = ret.a + ret.b * x 'prediction

Dim Residual As Double = y - yprime

ret.SigmaError += Residual * Residual

Next

ret.XMean = sigmax / n

ret.YMean = sigmay / n

ret.XStdDev = Math.Sqrt((CType(n * sigmaxx - sigmax * sigmax, Double)) / (CDbl(n) * CDbl(n) - 1.0))

ret.YStdDev = Math.Sqrt((CType(n * sigmayy - sigmay * sigmay, Double)) / (CDbl(n) * CDbl(n) - 1.0))

ret.StandardError = Math.Sqrt(ret.SigmaError / ret.SampleSize)

Dim ssx As Double = sigmaxx - ((sigmax * sigmax) / n)

Dim ssy As Double = sigmayy - ((sigmay * sigmay) / n)

Dim ssxy As Double = sigmaxy - ((sigmax * sigmay) / n)

ret.PearsonsR = ssxy / Math.Sqrt(ssx * ssy)

ret.t = ret.PearsonsR / Math.Sqrt((1 - (ret.PearsonsR * ret.PearsonsR)) / (n - 2))

Return ret

End Function

End Class

Posted on Tuesday, January 10, 2006 3:41 PM | Back to top


Comments on this post: Linear regression

# re: Linear regression
Requesting Gravatar...
Hi, thanks for this, I'm trying to use this code to test a curve i want to be able to identify 1 or more points which are out of step with the rest. don't suppose you have any samples
Left by Duncan on Jul 25, 2007 5:54 AM

# re: Linear regression
Requesting Gravatar...
When you say "out of step" what do you mean by out of step? Generally the way of doing this would be to plot a curve using something like Levenberg-Marquardt http://en.wikipedia.org/wiki/Levenberg-Marquardt_algorithm then you would just look at what you prediction was compared with what the actual data was (when they are far away you have identified such a position).

Most curve fitting is trying to minimize the sum of the squares of the differences between your prediction and the actual data ... so just find the squares that are the largest.

If you are looking for a simpler method you may was to just use things like standard deviation.
Left by Greg Young on Jul 25, 2007 12:18 PM

# re: Linear regression
Requesting Gravatar...
Thanks bro....
Left by Mike on Aug 03, 2011 4:29 PM

# re: Linear regression
Requesting Gravatar...
Hi Greg. I know this blog is old but the code is exactly what I am looking for. However, being a newbee to classes I have a bit of a problem.
I have added the class to my VB.NET program and have two arrays of type Double - X() and Y().
I have tried:
Dim R As New Regression()
RegressionStats = R.Regress(X,Y)

but get an error -
Conversion from type 'RegressionProcessInfo' to type 'String' is not valid.
How do I use the class?

Regards
Phil
Left by PhilPQ on Jan 30, 2013 5:45 PM

Your comment:
 (will show your gravatar)


Copyright © Greg Young | Powered by: GeeksWithBlogs.net