
OpenCL code results in a 'GPU device not responding' error... Any ideas?
Posted Tuesday, 10 August, 2010 - 08:16 by Hiran47
I'm new to OpenCL and am doing research in this area. After trying several OpenCL implementations, I found that Cloo is the most user-friendly implementation available to date.
As part of my research project, I'm doing a PI calculation on my GPU using OpenCL (merely to show that the GPU is a better unit for general-purpose calculations). I have an ATI Radeon HD 5770 GPU, and everything goes well for inputs under 1000. But whenever the input is larger than 1000 (say 1500), the display goes blank and the 'GPU device not responding' error results.

I tried the same code in an NVIDIA 9800 GPU & the same problem arose for inputs larger than 400.
The same code runs perfectly for any input when using only the CPU.
Here is my Code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Cloo;
using System.Runtime.InteropServices;
using System.Diagnostics;

namespace PI_with_Cloo
{
    /// <summary>
    /// Console program that computes decimal digits of PI on the first GPU of the
    /// first OpenCL platform. Every OpenCL work item evaluates one 9-digit group.
    /// </summary>
    class Program
    {
        static ComputePlatform platform = ComputePlatform.Platforms[0];

        #region Kernel Source
        // OpenCL C source. It must keep its line breaks: the // comments below would
        // otherwise comment out all the code that follows them on the same line.
        static string kernelSource = @"
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

// Returns (a * b) mod m. The product and the modulo are both evaluated in
// 64-bit before narrowing, so the product is never truncated to 32 bits first.
int mul_mod(int a, int b, int m)
{
    return (int)((((long)a) * ((long)b)) % m);
}

// Returns the inverse of x modulo y (extended Euclid), normalised to [0, y).
int inv_mod(int x, int y)
{
    int q, u, v, a, c, t;

    u = x;
    v = y;
    c = 1;
    a = 0;
    do
    {
        q = v / u;

        t = c;
        c = a - q * c;
        a = t;

        t = u;
        u = v - q * u;
        v = t;
    } while (u != 0);

    a = a % y;
    if (a < 0)
        a = y + a;
    return a;
}

// Returns (a ^ b) mod m by binary exponentiation.
int powmod(int a, int b, int m)
{
    int r, aa;

    r = 1;
    aa = a;
    while (1)
    {
        if ((b & 1) != 0)
        {
            r = mul_mod(r, aa, m);
        }
        b = b >> 1;
        if (b == 0)
            break;
        aa = mul_mod(aa, aa, m);
    }
    return r;
}

// return true if n is prime (only called with n > 2; every even n is rejected)
bool is_prime(int n)
{
    int r;

    if ((n % 2) == 0)
    {
        return false;
    }
    r = (int)sqrt((float)n);
    for (int i = 3; i <= r; i += 2)
    {
        if ((n % i) == 0)
        {
            return false;
        }
    }
    return true;
}

// return the prime number immediately after n
int next_prime(int n)
{
    n++;
    while (is_prime(n) == false)
    {
        n++;
    }
    return n;
}

// Fractional part of d for d > 0.
// NOTE(review): the d <= 0 branch is kept as posted; the caller only passes
// non-negative sums, so it is not expected to matter - confirm before reuse.
float frac(float d)
{
    if (d > 0)
    {
        return d - ((int)d);
    }
    else
    {
        return d + floor(d);
    }
}

// start from digit n, return 9 digits (as a fraction in [0, 1))
float CalculatePiDigits(int n)
{
    int av, vmax, num, den, s, t;
    int nb;
    float sum;
    int a;
    int i;
    int k, v, kq, kq2;

    nb = (int) ((n + 20) * log(10.0) / log(2.0));
    sum = 0;

    // One pass per odd prime a <= 2 * nb; each pass costs nb inner iterations.
    for (a = 3; a <= (2 * nb); a = next_prime(a))
    {
        vmax = (int) ((log((float)(2 * nb)) / log((float)a)));

        // av = a ^ vmax, the modulus used for this prime
        av = 1;
        for (i = 0; i < vmax; i++)
        {
            av = av * a;
        }

        s = 0;
        num = 1;
        den = 1;
        v = 0;
        kq = 1;
        kq2 = 1;

        for (k = 1; k <= nb; k++)
        {
            t = k;
            if (kq >= a)
            {
                t = t / a;
                v--;
                while ((t % a) == 0)
                {
                    t = t / a;
                    v--;
                }
                kq = 0;
            }
            kq++;
            num = mul_mod(num, t, av);

            t = 2 * k - 1;
            if (kq2 >= a)
            {
                if (kq2 == a)
                {
                    t = t / a;
                    v++;
                    while ((t % a) == 0)
                    {
                        t = t / a;
                        v++;
                    }
                }
                kq2 = kq2 - a;
            }
            den = mul_mod(den, t, av);
            kq2 = kq2 + 2;

            if (v > 0)
            {
                t = inv_mod(den, av);
                t = mul_mod(t, num, av);
                t = mul_mod(t, k, av);
                for (i = v; i < vmax; i++)
                {
                    t = mul_mod(t, a, av);
                }
                s = s + t;
                if (s >= av)
                {
                    s = s - av;
                }
            }
        }

        t = powmod(10, n - 1, av);
        s = mul_mod(s, t, av);
        sum = frac(sum + (float) s / (float) av);
    }
    return sum;
}

// Work item gid computes the digit group starting at position dval[gid].
__kernel void PI(__global int *dval, __global float *sumval)
{
    int gid = get_global_id(0);
    int d = dval[gid];
    float sum = CalculatePiDigits(d);
    sumval[gid] = sum;
}
";
        #endregion

        /// <summary>
        /// Asks for a digit count, rounds it up to whole 9-digit groups, runs the
        /// <c>PI</c> kernel with one work item per group, then prints the digits and
        /// the elapsed time.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        /// <remarks>
        /// NOTE(review): all groups are computed by a single kernel launch. A launch
        /// that keeps a display GPU busy for too long may be aborted by the operating
        /// system's GPU watchdog; because the cost of one work item grows with its
        /// digit offset, avoiding that presumably requires splitting the per-prime
        /// loop of CalculatePiDigits across several launches - not done here.
        /// </remarks>
        static void Main(string[] args)
        {
            ComputeContextPropertyList pList = new ComputeContextPropertyList(platform);
            ComputeContext context = new ComputeContext(ComputeDeviceTypes.Gpu, pList, null, IntPtr.Zero);

            Console.WriteLine("Platform Vendor: " + platform.Vendor);
            Console.WriteLine("Platform Name : " + platform.Name);
            Console.WriteLine("Device Name : " + context.Devices[0].Name);
            Console.WriteLine();

            // Get the Input Value...
            Console.Write("Enter an Integer for PI calculation: ");
            int digits;
            if (!int.TryParse(Console.ReadLine(), out digits) || digits < 1)
            {
                // Reject bad input up front instead of failing later with a
                // FormatException or a negative array size.
                Console.WriteLine("Please enter a positive integer.");
                return;
            }

            Stopwatch sw = new Stopwatch();
            sw.Start();

            // From here on 'digits' is the number of 9-digit groups (rounded up).
            digits = (digits - 1) / 9;
            digits++;
            Console.WriteLine("Calculating " + (digits * 9 + 1) + " PI digits....");

            char[] pival = new char[digits * 9 + 1];
            pival[digits * 9] = '0';

            // dval[i] is the 1-based position of the first digit of group i.
            int[] dval = new int[digits];
            for (int i = 0; i < digits; i++)
            {
                dval[i] = 1 + (i * 9);
            }

            ComputeBuffer<int> PI_dval = new ComputeBuffer<int>(context, ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer, dval);
            ComputeBuffer<float> PI_sumval = new ComputeBuffer<float>(context, ComputeMemoryFlags.WriteOnly, digits);

            // Calculate PI
            ComputeProgram program = new ComputeProgram(context, new string[] { kernelSource });
            program.Build(null, null, null, IntPtr.Zero);

            ComputeKernel kernel = program.CreateKernel("PI");
            kernel.SetMemoryArgument(0, PI_dval);
            kernel.SetMemoryArgument(1, PI_sumval);

            ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None);
            ComputeEventList events = new ComputeEventList();
            commands.Execute(kernel, null, new long[] { digits }, null, events);

            float[] sumval = new float[digits];
            GCHandle arrCHandle = GCHandle.Alloc(sumval, GCHandleType.Pinned);
            try
            {
                // The read is non-blocking; Finish() waits for the kernel and the copy.
                commands.Read(PI_sumval, false, 0, digits, arrCHandle.AddrOfPinnedObject(), events);
                commands.Finish();
            }
            finally
            {
                // Always unpin, even if the read or the kernel fails.
                arrCHandle.Free();
            }

            // Turn every fraction into its 9-character digit group.
            // NOTE(review): float holds roughly 7 significant decimal digits, so the
            // last digits of each group are likely unreliable - confirm.
            for (int i = 0; i < digits; i++)
            {
                int d = 1 + i * 9;
                string str = string.Format("{0:000000000}", (int)(sumval[i] * 1e9));
                str.CopyTo(0, pival, d - 1, 9);
            }

            sw.Stop();
            Console.WriteLine("Calculation completed successfully.\nCompute Time: " + sw.Elapsed.ToString());

            Console.Write("The Value is: 3.");
            Console.Write(pival);
            Console.ReadLine();
        }
    }
}
(FYI: I found the PI calculation code from the CodeProject)
If you have an idea about a way to get rid of this issue, please help me.
Thanks in advance.


Comments
Found the reason....but the problem continues.... :(
OK, now I have found out that the above error is caused by TDR (Timeout Detection and Recovery, a service that comes with the OS). It triggers whenever my application keeps the GPU busy for more than 2 seconds (which happens when the input is larger than 1000).
Now, I don't want to disable TDR but to code around it. So, how can I modify my code accordingly? Any ideas?
Re: OpenCL code results a 'GPU device not Responding' ...
Seems like a breakthrough! I've hit this error before (Nvidia) but never quite knew why. Writing down TDR.
My initial guess about your code was "something's wrong with the kernel", but I couldn't see past the single-character variable names. Anyway, I think you'll have to split your calculations into smaller chunks and transfer data between them accordingly. Unfortunately, I don't know enough about this algorithm to help directly. You'll probably need more ComputeBuffers to hold the temporary results and pass them in and out of the kernel.
Re: OpenCL code results a 'GPU device not Responding' ...
I've had this issue running just plain shader code. I did find it amusing (and typical) that it worked fine on a Linux install and caused the graphics driver to throw this error on Windows.
It was with my Mandelbrot shader (which could be optimised) and I only induced it creating a 4096x4096 texture with >150 passes on my old(ish) Geforce 8600M GT.
I think the only real way around it is to split your work groups up: for example, if possible, submit 0-100, then 100-200, then 200-300, and so on. Obviously this isn't very efficient, but I don't really see another way. That said, I've not worked with Cloo or OpenCL yet (it's on my list of things to do), so there may well be another way.
Re: OpenCL code results a 'GPU device not Responding' ...
@nythrix & Mincus: Thanx for ur replys guys....will think about ur advices....Thanx again....!!! :)