ptrace(PTRACE_CONT) cannot resume just-attached processes -
I am writing a program that needs to attach to other processes (which might have been created by a previous instance of the program) and watch for when they terminate.
If I keep the program running during the whole lifetime of the processes it created, everything works fine. But if I start a process, kill my program, and then restart it, the created process remains in the stopped state forever (it seems that ptrace(PTRACE_CONT, ...) cannot resume it). The code snippet is attached below:
static int exitflag = 0; static void sighandler (int/* signum */) { exitflag = 1; } int jsfnode::run (void) { /* load jobs */ { vector <jobinfo2> jobs; loadstruct <vector <jobinfo2> > ( jobfile (), jobs); (unsigned i=0 ; i<jobs.size () ; i++) { jobinfo2& info = jobs [i]; string name = info.parm.name; if (m_jobs.find (name) == m_jobs.end ()) { job2& job = m_jobs [name]; job.info = info; /* trace can wait() */ switch (info.state) { case js2active: case js2canceling: case js2suspending: if (ptrace (ptrace_attach, info.pid, 0, 0)) jdebug ("ptrace_attach failed for: %d (%s)\n", info.pid, strerror (errno)); default: break; } } } } /* run until signaled stop */ signal (sigint, sighandler); while (!exitflag) sleep (1); /* save jobs */ { vector <jobinfo2> jobs; (map <string, job2>::iterator it=m_jobs.begin () ; it!=m_jobs.end () ; it++) { jobinfo2& info = it->second.info; ptrace (ptrace_detach, info.pid, null, null); jobs.push_back (info); } savestruct <vector <jobinfo2> > ( jobfile (), jobs); } return 0; } void jsfnode::startjob (job2 & job) { jobparm2 parm = job.info.parm; jdebug ("starting \"%s\"..\n", parm.name.c_str()); /* uid of run-as user */ uid_t uid = 0; /* run root if specified user invalid */ struct passwd * pwe = getpwnam (parm.user.c_str()); if (pwe != null) uid = pwe->pw_uid; /* prepare script file */ string scriptfile = m_workdir+"/"+parm.name+"_scriptfile"; ofstream ofscriptfile (scriptfile.c_str()); ofscriptfile << parm.script; ofscriptfile.close(); chown (scriptfile.c_str(), uid, uid); chmod (scriptfile.c_str(), s_irwxu|s_irwxg|s_irwxo); /* prepare mpimachinefile */ string machinefile = m_workdir+"/"+parm.name+"_machinefile"; ofstream ofmachinefile (machinefile.c_str()); (resource::iterator it=parm.res.begin () ; it!=parm.res.end () ; it++) ofmachinefile << *it << ':' << parm.taskpernode << '\n'; ofmachinefile.close (); chown (machinefile.c_str(), uid, uid); chmod (machinefile.c_str(), s_irwxu|s_irwxg|s_irwxo); /* prepare redirection channels */ int ipipe 
[2] = {-1,-1}; int opipe [2] = {-1,-1}; if (parm.redio > 0) { if (pipe (ipipe) == -1) { unlink: unlink (machinefile.c_str()); unlink (scriptfile.c_str()); return; /* not fail job, try later */ } if (pipe (opipe) == -1) { close: close (ipipe [0]); close (ipipe [1]); goto unlink; } } /* ok, fork it! -----------------> */ pid_t pid; if ((pid = fork ()) == -1) { close (opipe [0]); close (opipe [1]); goto close; } if (pid == 0) { /* enable parent-tracing */ ptrace (ptrace_traceme, 0, null, null); /* drop root privilege */ setuid (uid); /* redirect stdin/stdout */ if (parm.redio) { if (dup2 (ipipe [0],0)<0 || dup2 (opipe [1],1)<0) exit (errno); close (ipipe [0]); close (ipipe [1]); close (opipe [0]); close (opipe [1]); } /* prepare arguments/environments */ char * arg[] = { strdup (scriptfile.c_str()), strdup (parm.args.c_str()), null /* required null entry */ }; setenv ("mpimachinefile", machinefile.c_str(), 1); setenv ("display", parm.headnode.c_str(), 1); setenv ("jsf_jobid", parm.name.c_str(), 1); /* execute it! ------> */ execv (scriptfile.c_str(), arg); exit (errno); } /* redirect stdin/stdout */ if (parm.redio) { close (ipipe [0]); close (opipe [1]); job.redpipe [0] = opipe [0]; job.redpipe [1] = ipipe [1]; } /* start nurse thread */ nursedata * nd = new nursedata (this, job); if (pthread_create (&job.nurseid, null, ::_jobnurse, nd) == 0) job.nurseactive = true; else delete nd; job.info.pid = pid; setjobstate (job, js2active); return; } void jsfnode::monitorjob (job2 & job) { int status; pid_t pid = waitpid (job.info.pid, &status, wnohang); if (pid < 0) { if (errno == echild) { /* job process has disappeared.. 
*/ job.exitcode = 0; setjobstate (job, js2finished); return; } } else if (pid == job.info.pid) { if (wifexited(status)) { job.exitcode = wexitstatus(status); setjobstate (job, js2finished); return; } else if (wifsignaled(status)) { setjobstate (job, js2canceled); return; } else if (wifstopped(status)) { if (ptrace (ptrace_cont, pid, null, null)) jdebug ("ptrace_cont failed for: %d (%s)\n", pid, strerror(errno)); } } /* ... */ }
Yes, the problem results from multi-threading. If monitorjob() runs in a separate thread, ptrace(PTRACE_CONT) fails. After moving it to the main thread (the one that called ptrace(PTRACE_ATTACH)), things go smoothly.
Comments
Post a Comment