frontier's picture

"Busy-waiting" thread lock whenever graphic card is working (GL.Finish || SwapBuffers) 100% cpu usage

Hello

I have been working on a small first-person shooter game for a while, but I have a strange 100% CPU usage problem (on one core) that I've been trying to figure out for some time; much experimentation and googling have turned up nothing.

It seems that whenever the graphics card is busy rendering a frame and holds my thread, the thread "busy-waits" (burning CPU) until the card is done, and this happens whether the block comes from a GL.Finish call or from SwapBuffers (without my having called GL.Finish myself).

I refactored my code into a simple example whose sole purpose is to illustrate and reproduce the problem:

using System;
using System.Threading;
using OpenTK;
using OpenTK.Graphics;
using OpenTK.Graphics.OpenGL;
using OpenTK.Input;

namespace Project1
{
    class Game : GameWindow
    {
        int ListId;
        private DateTime RenderTimeCpu;
        float rotation = 0.0f;
        public Game()
            : base(800, 600, GraphicsMode.Default, "Test cpu load!")
        {
        }
        public void RunGame()
        {
            Visible = true;
            OnLoad(new EventArgs());
            OnResize(new EventArgs());
            while (true)
            {
                base.ProcessEvents();
                if (Exists && !IsExiting)
                {
                    OnUpdateFrame(new FrameEventArgs());
                    OnRenderFrame(new FrameEventArgs());
                    Thread.Sleep(35);
                    Console.WriteLine("Hello --------------");
                }
                else
                {
                    break;
                }
            }
        }
        protected override void OnLoad(EventArgs e)
        {
            base.OnLoad(e);
            GL.ClearColor(0.1f, 0.2f, 0.5f, 0.0f);
            GL.Enable(EnableCap.DepthTest);
 
            // generate the display list (a grid of quads)
            int NumQuadsPerList = 200000;
            ListId = GL.GenLists(1);
            Random rnd = new Random();
            GL.NewList(ListId, ListMode.Compile);
            GL.Begin(BeginMode.Quads);
                for (int x = 0; x < Math.Sqrt(NumQuadsPerList); x++) {
                    int tmpx = x - 250; // offset
                    for (int y = 0; y < Math.Sqrt(NumQuadsPerList); y++) {
                        int tmpy = y - 250; // offset
                        int tmpzoffset = rnd.Next(0, 5);
                        GL.Color3(new Vector3((float)rnd.Next(10) / 10, (float)rnd.Next(10) / 10, (float)rnd.Next(10) / 10));
                        GL.Vertex3(new Vector3(-0.5f + (x * 0.8f) + tmpx, -0.5f + (y * 0.8f) + tmpy, 0.5f + (tmpzoffset * 2f)));
                        GL.Vertex3(new Vector3(-0.5f + (x * 0.8f) + tmpx, 0.5f + (y * 0.8f) + tmpy, 0.5f + (tmpzoffset * 2f)));
                        GL.Vertex3(new Vector3(0.5f + (x * 0.8f) + tmpx, 0.5f + (y * 0.8f) + tmpy, 0.5f + (tmpzoffset * 2f)));
                        GL.Vertex3(new Vector3(0.5f + (x * 0.8f) + tmpx, -0.5f + (y * 0.8f) + tmpy, 0.5f + (tmpzoffset * 2f)));
                    }
                }
            GL.End();
            GL.EndList();
 
        }
        protected override void OnResize(EventArgs e)
        {
            base.OnResize(e);
 
            GL.Viewport(ClientRectangle.X, ClientRectangle.Y, ClientRectangle.Width, ClientRectangle.Height);
 
            Matrix4 projection = Matrix4.CreatePerspectiveFieldOfView((float)Math.PI / 4, Width / (float)Height, 1.0f, 5000.0f);
            GL.MatrixMode(MatrixMode.Projection);
            GL.LoadMatrix(ref projection);
        }
        protected override void OnUpdateFrame(FrameEventArgs e)
        {
            base.OnUpdateFrame(e);
 
            if (Keyboard[Key.Escape]) {
                Exit();
            }
            if (Keyboard[Key.F11]) {
                if (this.WindowState == WindowState.Fullscreen) {
                    this.WindowState = WindowState.Normal;
                } else {
                    this.WindowState = WindowState.Fullscreen;
                }
            }
        }
        protected override void OnRenderFrame(FrameEventArgs e)
        {
            RenderTimeCpu = DateTime.Now; // resets and starts counting ms
            base.OnRenderFrame(e);
            int TimesToCallList = 35;
            rotation += 0.05f;
            GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);
            Matrix4 GeneralMatrix;
            Matrix4 modelview = Matrix4.LookAt(new Vector3((float)Math.Cos(rotation) * (TimesToCallList * 100), 500, (float)Math.Sin(rotation) * (TimesToCallList * 40)), new Vector3(0, 0, 20 * TimesToCallList), Vector3.UnitY);
            GL.MatrixMode(MatrixMode.Modelview);
            GL.LoadMatrix(ref modelview);
            GL.PushMatrix();
            GeneralMatrix = Matrix4.CreateTranslation(0, 0, 20);
            for (int i = 0; i < TimesToCallList; i++ ) {
                GL.MultMatrix(ref GeneralMatrix);
                GL.CallList(ListId);
            }
            GL.Finish();
            SwapBuffers();
            Console.WriteLine("RenderTimeCpuMs:" + ((TimeSpan)(DateTime.Now - RenderTimeCpu)).TotalMilliseconds); // displays ms so far
        }
        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            using (Game game = new Game())
            {
                //game.Run(30.0, 120);
                game.RunGame();
            }
        }
    }
}

// the includes/usings are of course at the top; they are included in the example above

All this does is create a large display list with an assortment of quads; you can modify NumQuadsPerList to control how many quads go into the list. Then, in OnRenderFrame, it calls that list TimesToCallList times.
It also writes to the console the number of milliseconds between the
RenderTimeCpu = DateTime.Now; call
and the
Console.WriteLine("RenderTimeCpuMs:" + ((TimeSpan)(DateTime.Now - RenderTimeCpu)).TotalMilliseconds); call.

Here is the interesting part!
I have tried the following things with this exact code to try to narrow down the problem.

Experiment 1!
Set the number of quads and the number of times the list is drawn to an amount your graphics card can finish in 1 ms or less. This can be verified with a simple test:

RenderTimeCpu = DateTime.Now; // resets and starts counting ms
GL.Finish();
Console.WriteLine("RenderTimeCpuMs:" + ((TimeSpan)(DateTime.Now - RenderTimeCpu)).TotalMilliseconds); // displays ms so far

Since we force it to hold the thread until rendering is done, and we count the milliseconds in between, we know how long our graphics card took! Get it down to 0-1 ms.

At this point CPU usage should be negligible, only 0-5% in Task Manager; at least it is for me.

Experiment 2!
Now change NumQuadsPerList and TimesToCallList to a point where it takes the card 10-15 ms to render; verify this with Experiment 1.
Now look at CPU usage in Task Manager. For me it's 50%, since I have 2 cores; it completely burns the thread while the card is rendering.

Experiment 3!
Comment out the GL.Finish call and wrap the timing calls around SwapBuffers.
Again it should be the same: full CPU usage on one core.
However, all the time it takes to render a given amount of work (verified with Experiment 1) is now spent inside the SwapBuffers call on my computer.
Is OpenTK meant to block and hold the thread during SwapBuffers? Shouldn't that ONLY happen on GL.Finish?

Experiment 4!
Now uncomment GL.Finish, but keep the timing calls around SwapBuffers.
I still have full CPU usage on one core, but the console says the SwapBuffers call only took 0-1 ms.

Final Experiment
For this I'd like you to put RenderTimeCpu = DateTime.Now; as the FIRST call inside the OnRenderFrame function, and put the Console.WriteLine just above GL.Finish and SwapBuffers, e.g.:

        protected override void OnRenderFrame(FrameEventArgs e)
        {
            RenderTimeCpu = DateTime.Now; // resets and starts counting ms
            base.OnRenderFrame(e);
            int TimesToCallList = 35;
            rotation += 0.05f;
            GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);
            Matrix4 GeneralMatrix;
            Matrix4 modelview = Matrix4.LookAt(new Vector3((float)Math.Cos(rotation) * (TimesToCallList * 100), 500, (float)Math.Sin(rotation) * (TimesToCallList * 40)), new Vector3(0, 0, 20 * TimesToCallList), Vector3.UnitY);
            GL.MatrixMode(MatrixMode.Modelview);
            GL.LoadMatrix(ref modelview);
            GL.PushMatrix();
            GeneralMatrix = Matrix4.CreateTranslation(0, 0, 20);
            for (int i = 0; i < TimesToCallList; i++ ) {
                GL.MultMatrix(ref GeneralMatrix);
                GL.CallList(ListId);
            }
            Console.WriteLine("RenderTimeCpuMs:" + ((TimeSpan)(DateTime.Now - RenderTimeCpu)).TotalMilliseconds); // displays ms so far
            GL.Finish();
            SwapBuffers();
        }

This is simply a test to verify that it's not my matrix code or the GL.CallList loop that actually eats the CPU; for me this gives 0 ms on every frame,

but of course still full CPU load :(

Conclusion
I imagine the problem is that whenever the graphics card is actually rendering, whether triggered by my GL.Finish call or by SwapBuffers (which I guess calls it internally or something), OpenTK/OpenGL/something along the chain starts burning the CPU while it waits.
But I haven't a clue why, or what to do about it :O

At first I used the standard Game example from your site, with the built-in Run function and a render frequency and update frequency. But I had the same problem no matter what I did, whether I left the render frequency at 0 so it rendered as often as possible, or set vsync, or whatever. No matter what I did, I got the same problem whenever GL.Finish took a serious number of milliseconds, meaning the graphics card couldn't render the frame almost instantly.
So I simply made my own run function (RunGame) to see if it was the OpenTK render/update thread system that caused it, but no, that changed nothing.
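
For reference, the standard setup I started from looks roughly like this (the frequencies and vsync mode shown here are just illustrative values, not the exact ones I tested):

using (Game game = new Game())
{
    game.VSync = VSyncMode.On;  // vsync setting on the GameWindow
    game.Run(30.0, 120.0);      // 30 updates/s, 120 renders/s; a render frequency of 0 renders as often as possible
}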

Does anyone have any clues?

This was the same with OpenTK 1.0 from your front page; then I downloaded the latest from trunk (yesterday), compiled it and switched to it, but that changed absolutely nothing.
I have Windows 7 32-bit and a GeForce 8800-something card. Here is my dxdiag: http://pastebin.com/J6xwpf7m
I also use the .NET Framework and Visual Studio 2010.

Hope someone can help.

UPDATE:
I sent my test application to one of my friends and guided him through running the same tests and reporting the outcome back to me. This confirms that it does not always happen.
Even when we put so many quads on the screen that each frame took 125 ms to render, his CPU usage stayed exactly the same, 0-2%.

His computer is Windows 7 64-bit, with a Radeon 6870 graphics card.

So one of two options seems to remain:
1: This is an error/bug in my drivers/card causing it entirely, and OpenTK has no bug at all.
2: This is a bug in the OpenTK framework that only shows up on certain hardware/drivers, and I happen to have the right setup to trigger it.

If anyone believes the latter is the case, I'm willing to serve as a test machine and run any debug/test code you want to send me and report the results back, so we can find the bug and fix it for the framework.

Update 2
I had three other friends test it as well, two of whom had Radeon graphics cards; neither of them got the problem, no matter how many quads were drawn or how long it took, and CPU usage stayed at virtually nothing. But the third one had a GeForce card slightly newer than mine, and bingo, same problem.

A pattern seems to be emerging.


Comments

Robmaister's picture

DateTime.Now has an accuracy of something insane like 10 ms; I would recommend against using it. Instead, OnRenderFrame and OnUpdateFrame are passed the frame time via FrameEventArgs, and that value is far more accurate. If for some reason you absolutely don't want to use that, use System.Diagnostics.Stopwatch; it also has far better accuracy than DateTime.Now.
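
For example, a minimal sketch of the Stopwatch approach as a drop-in for the DateTime.Now timing in the test app (the variable name is just illustrative):

var renderTimer = System.Diagnostics.Stopwatch.StartNew();
// ... issue the frame's GL calls ...
GL.Finish();
Console.WriteLine("RenderTimeCpuMs: " + renderTimer.Elapsed.TotalMilliseconds);

// or, inside OnRenderFrame/OnUpdateFrame, use the frame time OpenTK already passes in:
Console.WriteLine("FrameTime (s): " + e.Time);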

Anyway, it looks like you're bypassing GameWindow's usual message loop and adding a fixed sleep of about 35 ms in your own message loop (RunGame()). This seems to be the only thing about your current project that's different from how any of mine work, so I recommend removing the RunGame method and going back to game.Run().

frontier's picture

Thanks for the reply, but it seems you did not even read my message?

My problem is not my game loop. My problem is that when I call GL.Finish, and OpenTK/OpenGL blocks the thread (while waiting for the graphics card to finish everything), it gives 100% CPU usage on one core of my CPU.

This only seems to happen on computers with GeForce cards, and not on computers with Radeon cards.

The precision of DateTime.Now has nothing to do with this.

Robmaister's picture

I'm just trying to narrow down the variables here; there are many possible reasons for the thread to hang on that method. Based on my own projects using OpenTK that run just fine, I'm spotting differences and trying to narrow down which one is causing the issue. One of the biggest differences I'm seeing is your own custom run loop, with a fixed thread sleep in there. You created it to see if it was an issue with Run, and it wasn't, so I recommend changing it back. GL.Finish() is never called in my code; I simply call SwapBuffers to end each frame, and my projects don't hang with 100% CPU time each frame. Consider removing GL.Finish() entirely and just using SwapBuffers().

Also make sure you have the latest drivers for your graphics card; it may just be a bug in an older nVidia driver: http://www.nvidia.com/Download/index.aspx

frontier's picture

Hello Robmaister, I appreciate your help :)

Anyway, I did start out with the standard game loop, but I had the exact same problem. I wrote my own as a way to narrow down the problem and see if it was the built-in loop system that caused it; my own gave the exact same result, and then I simply didn't bother to switch back.

Anyway, can I ask you to try something? Put a timer around a GL.Finish call, then issue so many draw operations that it takes at least 20 ms (as measured by the timer), and then check your CPU usage. That way you can test whether your computer has the same problem, because if you do get high CPU usage, then that proves it's the framework, right?

Would you run that test for me? And what graphics card do you have?

Again, I have had many people run this test and the results speak for themselves: whenever it's on a computer with a GeForce card, it eats 100% of a core while it's drawing, but on a Radeon it does not.

Also, if I just use SwapBuffers and don't call GL.Finish, I get the same CPU results.

Robmaister's picture

I just ran the test class on an i7-2600K with a GTX 570; I'm at 10% CPU usage, which is more than the 2% I'm usually at without the test. I commented out everything in OnRenderFrame except GL.Clear and SwapBuffers, and CPU usage stays at around 2%. Uncommenting GL.Finish raises it up to ~12%.

If I comment out GL.Finish again and only uncomment the for loop (except for the MultMatrix call), it stays at around 2%. Uncommenting everything except GL.Finish gives 15% usage. If I reduce TimesToCallList to 10, all the CPU usage magically goes away again.

The issue is one of two things: you're either doing a ton of CPU-bound work, or it takes a lot of time just to process the 8 million vertices you defined. When rendering with OpenGL, the CPU and GPU are not synchronized, so when you make a GL call, it just queues the work up for the GPU to do after it's done with its current work. At the end of each frame it burns CPU time until the GPU is completely done with everything. Perhaps the ATI implementation does some sort of thread sleeping while the nVidia implementation burns CPU time. As far as my tests go, OpenTK itself has no issue with GL.Finish or SwapBuffers alone. If you put enough load on the GPU, it seems to trigger some sort of increase in CPU time, but that is not inherently an issue with OpenTK; GL.Finish and SwapBuffers directly invoke the system's implementations. I would recommend trying out this same program in C or C++ to see if the same processor usage occurs on nVidia cards.
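
For instance, here is a rough sketch of how that asynchrony could be observed in the test app (the Stopwatch usage is mine, not part of the original code):

var sw = System.Diagnostics.Stopwatch.StartNew();
for (int i = 0; i < TimesToCallList; i++)
    GL.CallList(ListId);                        // only queues work for the GPU
double queuedMs = sw.Elapsed.TotalMilliseconds;
GL.Finish();                                    // blocks until the GPU has drained the queue
double finishedMs = sw.Elapsed.TotalMilliseconds;
Console.WriteLine("Queued in " + queuedMs + " ms, GPU finished after " + finishedMs + " ms");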

frontier's picture

"The issue is one of two things: you're either doing a ton of CPU-bound work , or it takes a lot of time just to process the 8 million vertices you defined"

But I don't do any CPU-bound work? I only calculate and define the vertices once, in the OnLoad event, to generate the display list.

Isn't the display list stored in the card's memory and such?

All I do in the render loop is make a single call to it for each "TimesToCallList" iteration; that should take up no CPU.
..

Anyway, SwapBuffers also holds the thread for me in the example: if I comment out GL.Finish and put my timer around SwapBuffers, before and after, like this:

RenderTimeCpu = DateTime.Now;
SwapBuffers();
Console.WriteLine("SwapBuffersMs:" + (DateTime.Now - RenderTimeCpu).TotalMilliseconds);

It still holds the thread for the number of milliseconds it takes the card to do the rendering. And the same CPU usage, of course.

But even if it didn't, I still don't think this is the way it should be.
It makes absolutely no sense that when GL.Finish holds the thread, it burns it. There is no way that can be how it's supposed to work!
I mean, what if the computer only has one core? Then you're reduced to using like 0.001% of your GPU's capabilities just to make sure it won't burn all your CPU while it's working, OR you have to avoid GL.Finish, in which case you lose precise control over when to render and when not to.
I refuse to believe it's meant to be this way; it simply does not make sense.

Robmaister's picture

At this point your GPU has to process 28,000,000 vertices, in both immediate mode and as quads. I know you have display lists set up, but not even the most demanding games put 42,000,000 triangles on the screen, and if they do, they use triangle strips or plain triangles, not quads. I haven't dealt with immediate mode in a really long time, and even then I moved to VBOs very quickly, so I'm not sure whether display lists are client-side or server-side, but if I raise the display list size enough, the program eats up 1 GB of RAM on startup and renders very slowly. As far as I can tell, display lists are still doing something client-side, which means either processing the triangles is taking too long or display lists do some operation client-side which, when repeated a few million times, will use up CPU.

I decided to test this idea, and I replaced your display list with a single VBO. While it used a fraction of the RAM the display list was using, it was still slow and hogging CPU. I slowly decreased the number of draw calls until, at a number around 15, CPU usage started dropping. No good; it's still slow.

So I tested one more thing, and this IS the solution. Right now you have 35 separate draw calls of 800,000 vertices each in a for loop. This means it has to queue up 35 draw calls and pause the GPU as it renders each one. Multiplying the amount of data stored in the display list by 35 drops CPU usage down to almost nothing. Also, to make sure this works with display lists, I did a little bit of searching and found GL.CallLists. I created a 35-item int[] with each value being ListId and passed the array to GL.CallLists, and the CPU usage also drops down to nothing. You are essentially queuing up a lot of really heavy calls instead of compressing them into one call, which is what OpenGL prefers.
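
A minimal sketch of that GL.CallLists approach, assuming the int[] overload and the default list base of 0 (note this drops the per-iteration GL.MultMatrix translation from the original loop):

int[] lists = new int[TimesToCallList];
for (int i = 0; i < lists.Length; i++)
    lists[i] = ListId;                          // the same display list, queued 35 times
GL.CallLists(lists.Length, ListNameType.Int, lists);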

And with double-buffered applications (which OpenTK automatically sets up), there is no need for GL.Flush or GL.Finish, as SwapBuffers already flushes all OpenGL commands before switching the buffers. There is absolutely no loss of precision over what gets rendered to which buffer if you call SwapBuffers at the end of each frame, and absolutely no need to call GL.Finish at the end of a frame. Don't believe me?
http://stackoverflow.com/questions/2143240/opengl-glflush-vs-glfinish
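
In other words, the end of a frame can look as simple as this (a sketch, not the original test code):

protected override void OnRenderFrame(FrameEventArgs e)
{
    base.OnRenderFrame(e);
    GL.Clear(ClearBufferMask.ColorBufferBit | ClearBufferMask.DepthBufferBit);
    // ... draw calls ...
    SwapBuffers(); // flushes queued GL commands and presents the back buffer
}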

Robmaister's picture

Also, I ran gDEBugger on the test application; both GL.Finish and making 35 GL.CallList calls per frame attribute quite a bit of CPU usage to the program.