hydra/src/script/hydra-queue-runner
Eelco Dolstra f592ce0026 Fix extreme slowness in hydra-queue-runner
If there are builds in the queue that depend on another scheduled
build, then hydra-queue-runner will start the dependency first and
block the dependent builds.  This is implemented in
findBuildDependencyInQueue.  However, if there are tens of thousands
of such dependent builds, since each call to
findBuildDependencyInQueue may take a second or so, hydra-queue-runner
will spend hours just deciding which builds *not* to do.  Thus very
little progress is made.

So now, when a build is started, we immediately check which builds are
"blocked" by it (i.e. depend on it), and remove such builds from
consideration.
2013-10-11 10:54:02 +02:00

273 lines
9.8 KiB
Perl
Executable File

#! /var/run/current-system/sw/bin/perl -w
use strict;
use Cwd;
use File::Basename;
use POSIX qw(dup2 :sys_wait_h);
use Hydra::Schema;
use Hydra::Helper::Nix;
use Hydra::Model::DB;
use IO::Handle;
use Nix::Store;
use Set::Scalar;
chdir Hydra::Model::DB::getHydraPath or die;
my $db = Hydra::Model::DB->new();
STDOUT->autoflush();
my $lastTime;
#$SIG{CHLD} = 'IGNORE';
sub unlockDeadBuilds {
# Unlock builds whose building process has died.
txn_do($db, sub {
my @builds = $db->resultset('Builds')->search({finished => 0, busy => 1});
foreach my $build (@builds) {
my $pid = $build->locker;
my $unlock = 0;
if ($pid == $$) {
if (!defined $lastTime || $build->starttime < $lastTime - 300) {
$unlock = 1;
}
} elsif (kill(0, $pid) != 1) { # see if we can signal the process
$unlock = 1;
}
if ($unlock) {
print "build ", $build->id, " pid $pid died, unlocking\n";
$build->update({ busy => 0, locker => "" });
$build->buildsteps->search({ busy => 1 })->update({ busy => 0, status => 4, stoptime => time });
}
}
});
}
# Given a build, return an arbitrary queued build on which this build
# depends; or undef if no such build exists.
sub findBuildDependencyInQueue {
my ($buildsByDrv, $build) = @_;
return undef unless isValidPath($build->drvpath);
my @deps = grep { /\.drv$/ && $_ ne $build->drvpath } computeFSClosure(0, 0, $build->drvpath);
return unless scalar @deps > 0;
foreach my $d (@deps) {
my $bs = $buildsByDrv->{$d};
next unless defined $bs;
return $db->resultset('Builds')->find((@$bs)[0]);
}
return undef;
}
sub blockBuilds {
my ($buildsByDrv, $blockedBuilds, $build) = @_;
my @rdeps = grep { /\.drv$/ && $_ ne $build->drvpath } computeFSClosure(1, 0, $build->drvpath);
foreach my $drv (@rdeps) {
my $bs = $buildsByDrv->{$drv};
next if !defined $bs;
$blockedBuilds->insert($_) foreach @$bs;
}
}
sub checkBuilds {
# print "looking for runnable builds...\n";
my @buildsStarted;
my $machines = getMachines;
my %maxConcurrent;
foreach my $machineName (keys %{$machines}) {
foreach my $system (@{${$machines}{$machineName}{'systemTypes'}}) {
$maxConcurrent{$system} = (${$machines}{$machineName}{'maxJobs'} or 0) + ($maxConcurrent{$system} or 0)
}
}
txn_do($db, sub {
# Cache scheduled builds by derivation path to speed up
# findBuildDependencyInQueue.
my $buildsByDrv = {};
push @{$buildsByDrv->{$_->drvpath}}, $_->id
foreach $db->resultset('Builds')->search({ finished => 0 });
# Builds in the queue of which a dependency is already building.
my $blockedBuilds = Set::Scalar->new();
blockBuilds($buildsByDrv, $blockedBuilds, $_)
foreach $db->resultset('Builds')->search({ finished => 0, busy => 1 });
# Get the system types for the runnable builds.
my @systemTypes = $db->resultset('Builds')->search(
{ finished => 0, busy => 0 },
{ join => ['project'], select => ['system'], as => ['system'], distinct => 1 });
# Get the total number of scheduling shares.
my $totalShares = getTotalShares($db);
# For each system type, select up to the maximum number of
# concurrent build for that system type.
foreach my $system (@systemTypes) {
# How many builds are already currently executing for this
# system type?
my $nrActive = $db->resultset('Builds')->search(
{finished => 0, busy => 1, system => $system->system})->count;
(my $systemTypeInfo) = $db->resultset('SystemTypes')->search({system => $system->system});
my $max = defined $systemTypeInfo ? $systemTypeInfo->maxconcurrent : $maxConcurrent{$system->system} // 2;
my $extraAllowed = $max - $nrActive;
next if $extraAllowed <= 0;
print STDERR "starting at most $extraAllowed builds for system ${\$system->system}\n";
my $timeSpentPerJobset;
j: while ($extraAllowed-- > 0) {
my @runnableJobsets = $db->resultset('Builds')->search(
{ finished => 0, busy => 0, system => $system->system },
{ select => ['project', 'jobset'], distinct => 1 });
next if @runnableJobsets == 0;
my $windowSize = 24 * 3600;
my $costPerBuild = 30;
my $totalWindowSize = $windowSize * $max;
my @res;
foreach my $b (@runnableJobsets) {
my $jobset = $db->resultset('Jobsets')->find($b->get_column('project'), $b->get_column('jobset')) or die;
my $timeSpent = $timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')};
if (!defined $timeSpent) {
$timeSpent = $jobset->builds->search(
{ },
{ where => \ ("(finished = 0 or (me.stoptime >= " . (time() - $windowSize) . "))")
, join => 'buildsteps'
, select => \ "sum(coalesce(buildsteps.stoptime, ${\time}) - buildsteps.starttime)"
, as => "sum" })->single->get_column("sum") // 0;
# Add a 30s penalty for each started build. This
# is to account for jobsets that have running
# builds but no build steps yet.
$timeSpent += $jobset->builds->search({ finished => 0, busy => 1 })->count * $costPerBuild;
$timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')} = $timeSpent;
}
my $share = $jobset->schedulingshares || 1; # prevent division by zero
my $used = $timeSpent / ($totalWindowSize * ($share / $totalShares));
#printf STDERR "%s:%s: %d s, total used = %.2f%%, share used = %.2f%%\n", $jobset->get_column('project'), $jobset->name, $timeSpent, $timeSpent / $totalWindowSize * 100, $used * 100;
push @res, { jobset => $jobset, used => $used };
}
foreach my $r (sort { $a->{used} <=> $b->{used} } @res) {
my $jobset = $r->{jobset};
#print STDERR "selected ", $jobset->get_column('project'), ':', $jobset->name, "\n";
# Select the highest-priority build for this jobset.
my @builds = $jobset->builds->search(
{ finished => 0, busy => 0, system => $system->system },
{ order_by => ["priority DESC", "id"] });
foreach my $build (@builds) {
next if $blockedBuilds->has($build->id);
# Find a dependency of $build that has no queued
# dependencies itself. This isn't strictly necessary,
# but it ensures that Nix builds are done as part of
# their corresponding Hydra builds, rather than as a
# dependency of some other Hydra build.
while (my $dep = findBuildDependencyInQueue($buildsByDrv, $build)) {
$build = $dep;
}
next if $build->busy;
printf STDERR "starting build %d (%s:%s:%s) on %s; jobset at %.2f%% of its share\n",
$build->id, $build->project->name, $build->jobset->name, $build->job->name, $build->system, $r->{used} * 100;
my $logfile = getcwd . "/logs/" . $build->id;
mkdir(dirname $logfile);
unlink($logfile);
$build->update(
{ busy => 1
, locker => $$
, logfile => $logfile
});
push @buildsStarted, $build;
$timeSpentPerJobset->{$jobset->get_column('project')}->{$jobset->name} += $costPerBuild;
blockBuilds($buildsByDrv, $blockedBuilds, $build);
next j;
}
}
last; # nothing found, give up on this system type
}
}
$lastTime = time();
$_->update({ starttime => time() }) foreach @buildsStarted;
});
# Actually start the builds we just selected. We need to do this
# outside the transaction in case it aborts or something.
foreach my $build (@buildsStarted) {
my $id = $build->id;
eval {
my $logfile = $build->logfile;
my $child = fork();
die unless defined $child;
if ($child == 0) {
eval {
open LOG, ">$logfile" or die "cannot create logfile $logfile";
POSIX::dup2(fileno(LOG), 1) or die;
POSIX::dup2(fileno(LOG), 2) or die;
exec("hydra-build", $id);
};
warn "cannot start build $id: $@";
POSIX::_exit(1);
}
};
if ($@) {
warn $@;
txn_do($db, sub {
$build->update({ busy => 0, locker => $$ });
});
}
}
}
if (scalar(@ARGV) == 1 && $ARGV[0] eq "--unlock") {
unlockDeadBuilds;
exit 0;
}
while (1) {
eval {
# Clean up zombies.
while ((waitpid(-1, &WNOHANG)) > 0) { };
unlockDeadBuilds;
checkBuilds;
};
warn $@ if $@;
# print "sleeping...\n";
sleep(5);
}