If there are builds in the queue that depend on another scheduled build, then hydra-queue-runner will start the dependency first and block the dependent builds. This is implemented in findBuildDependencyInQueue. However, if there are tens of thousands of such dependent builds, since each call to findBuildDependencyInQueue may take a second or so, hydra-queue-runner will spend hours just deciding which builds *not* to do. Thus very little progress is made. So now, when a build is started, we immediately check which builds are "blocked" by it (i.e. depend on it), and remove such builds from consideration.
273 lines
9.8 KiB
Perl
Executable File
273 lines
9.8 KiB
Perl
Executable File
#! /var/run/current-system/sw/bin/perl -w
|
|
|
|
use strict;
|
|
use Cwd;
|
|
use File::Basename;
|
|
use POSIX qw(dup2 :sys_wait_h);
|
|
use Hydra::Schema;
|
|
use Hydra::Helper::Nix;
|
|
use Hydra::Model::DB;
|
|
use IO::Handle;
|
|
use Nix::Store;
|
|
use Set::Scalar;
|
|
|
|
# Work from the Hydra data directory; checkBuilds derives log file
# paths from the current working directory.
chdir(Hydra::Model::DB::getHydraPath()) or die;

# Database handle shared by every subroutine in this script.
my $db = Hydra::Model::DB->new();

# Do not buffer progress messages written to STDOUT.
STDOUT->autoflush(1);

# Time at which the last scheduling pass finished; stays undefined
# until checkBuilds has completed once.
my $lastTime;

#$SIG{CHLD} = 'IGNORE';
|
|
|
|
|
|
# Unlock builds whose building process has died.
#
# Inspects every build that is unfinished but marked busy and clears
# its lock when the process recorded in the "locker" column can no
# longer be running it:
#   - the locker is not a usable process id: nothing can hold the lock;
#   - the locker is this process: the build counts as dead when no
#     scheduling pass has completed yet ($lastTime undefined) or when
#     it started more than 300 seconds before the last pass;
#   - the locker is another process: dead when signal 0 cannot be
#     delivered for any reason other than missing permission.
# Build steps of an unlocked build that are still busy are closed with
# status 4 and the current time as stop time.  All updates happen in a
# single transaction.  Takes no arguments.
sub unlockDeadBuilds {
    txn_do($db, sub {
        my @builds = $db->resultset('Builds')->search({finished => 0, busy => 1});
        foreach my $build (@builds) {
            my $pid = $build->locker;
            my $unlock = 0;
            if (!defined $pid || $pid !~ /^[1-9][0-9]*$/) {
                # An empty or malformed locker would numify to 0, and
                # kill(0, 0) probes our own process group: it always
                # succeeds, so the build would stay locked forever.
                $pid //= "";
                $unlock = 1;
            } elsif ($pid == $$) {
                if (!defined $lastTime || $build->starttime < $lastTime - 300) {
                    $unlock = 1;
                }
            } elsif (!kill(0, $pid) && !$!{EPERM}) {
                # Signal 0 only tests whether the process can be
                # signalled.  EPERM means the process exists but belongs
                # to someone else, so it must not be treated as dead.
                $unlock = 1;
            }
            if ($unlock) {
                print "build ", $build->id, " pid $pid died, unlocking\n";
                $build->update({ busy => 0, locker => "" });
                $build->buildsteps->search({ busy => 1 })->update({ busy => 0, status => 4, stoptime => time });
            }
        }
    });
}
|
|
|
|
|
|
# Return some queued build that the given build depends on, or undef
# when there is none.
#
# Arguments:
#   $buildsByDrv - hash ref mapping a derivation path to an array ref
#                  of build ids for that derivation
#   $build       - the build whose dependencies are examined
#
# The candidates are the .drv paths in the closure of the build's own
# derivation, excluding that derivation itself.  The first candidate
# with an entry in $buildsByDrv wins and the Builds row for the first
# id listed there is returned.  A build whose derivation is not a
# valid store path has no dependencies as far as this function is
# concerned.
sub findBuildDependencyInQueue {
    my ($buildsByDrv, $build) = @_;

    my $drvPath = $build->drvpath;
    if (!isValidPath($drvPath)) {
        return undef;
    }

    my @deps;
    for my $path (computeFSClosure(0, 0, $drvPath)) {
        push @deps, $path if $path =~ /\.drv$/ && $path ne $drvPath;
    }
    return if !@deps;

    for my $dep (@deps) {
        my $ids = $buildsByDrv->{$dep};
        if (defined $ids) {
            return $db->resultset('Builds')->find((@{$ids})[0]);
        }
    }

    return undef;
}
|
|
|
|
|
|
# Mark every queued build that depends on the given build as blocked.
#
# Arguments:
#   $buildsByDrv   - hash ref mapping a derivation path to an array ref
#                    of build ids for that derivation
#   $blockedBuilds - set object to which the ids of blocked builds are
#                    added via its insert method
#   $build         - the build that is running or about to be started
#
# The reverse closure of the build's derivation is computed and every
# other .drv path in it that has an entry in $buildsByDrv contributes
# all of its build ids to $blockedBuilds.  The return value is not
# meaningful.
sub blockBuilds {
    my ($buildsByDrv, $blockedBuilds, $build) = @_;

    my $drvPath = $build->drvpath;

    # Same guard as findBuildDependencyInQueue: without a valid
    # derivation there is no closure to walk, hence nothing to block.
    # NOTE(review): computeFSClosure presumably fails on an invalid
    # path, which would abort the caller's whole scheduling pass --
    # confirm against the Nix::Store bindings.
    return unless isValidPath($drvPath);

    my @rdeps = grep { /\.drv$/ && $_ ne $drvPath } computeFSClosure(1, 0, $drvPath);

    foreach my $drv (@rdeps) {
        my $bs = $buildsByDrv->{$drv};
        next if !defined $bs;
        $blockedBuilds->insert($_) foreach @$bs;
    }
}
|
|
|
|
|
|
# Select runnable builds from the queue and start a hydra-build
# process for each of them.  Takes no arguments.
#
# Selection happens inside one database transaction:
#   1. All unfinished builds are indexed by derivation path, and every
#      build that depends on a build that is already busy is put into
#      a "blocked" set.
#   2. For each system type that has idle queued builds, the number of
#      additional builds allowed is the concurrency limit minus the
#      number of busy builds.  The limit comes from the SystemTypes
#      table if it has a row for the system, otherwise from the sum of
#      maxJobs over the machines supporting the system, otherwise 2.
#   3. For every free slot, the jobsets with idle builds are ordered by
#      how much of their scheduling share they used in the last 24
#      hours, and the first non-blocked build (highest priority, then
#      lowest id) of the least-served jobset is picked.  If that build
#      has queued dependencies, a dependency without queued
#      dependencies of its own is started in its place.
#   4. A started build is marked busy and locked by this process, and
#      the builds depending on it are added to the blocked set.
# After the transaction one child process per selected build is forked
# with stdout and stderr redirected to the build's log file.  If
# forking fails the build is unlocked again.
sub checkBuilds {
    # print "looking for runnable builds...\n";

    my @buildsStarted;

    my $machines = getMachines;

    # Per system type: total number of job slots over all machines
    # that support it.
    my %maxConcurrent;

    foreach my $machineName (keys %{$machines}) {
        foreach my $system (@{${$machines}{$machineName}{'systemTypes'}}) {
            $maxConcurrent{$system} = (${$machines}{$machineName}{'maxJobs'} or 0) + ($maxConcurrent{$system} or 0)
        }
    }

    txn_do($db, sub {

        # NOTE(review): if txn_do re-runs this closure after a failed
        # attempt, builds picked by the rolled-back attempt must not
        # survive in the list -- confirm txn_do's retry behaviour.
        @buildsStarted = ();

        # Cache scheduled builds by derivation path to speed up
        # findBuildDependencyInQueue.
        my $buildsByDrv = {};
        push @{$buildsByDrv->{$_->drvpath}}, $_->id
            foreach $db->resultset('Builds')->search({ finished => 0 });

        # Builds in the queue of which a dependency is already building.
        my $blockedBuilds = Set::Scalar->new();
        blockBuilds($buildsByDrv, $blockedBuilds, $_)
            foreach $db->resultset('Builds')->search({ finished => 0, busy => 1 });

        # Get the system types for the runnable builds.
        my @systemTypes = $db->resultset('Builds')->search(
            { finished => 0, busy => 0 },
            { join => ['project'], select => ['system'], as => ['system'], distinct => 1 });

        # Get the total number of scheduling shares.
        my $totalShares = getTotalShares($db);

        # For each system type, select up to the maximum number of
        # concurrent build for that system type.
        foreach my $system (@systemTypes) {
            # How many builds are already currently executing for this
            # system type?
            my $nrActive = $db->resultset('Builds')->search(
                {finished => 0, busy => 1, system => $system->system})->count;

            (my $systemTypeInfo) = $db->resultset('SystemTypes')->search({system => $system->system});
            my $max = defined $systemTypeInfo ? $systemTypeInfo->maxconcurrent : $maxConcurrent{$system->system} // 2;

            my $extraAllowed = $max - $nrActive;
            next if $extraAllowed <= 0;

            print STDERR "starting at most $extraAllowed builds for system ${\$system->system}\n";

            # Cache of the time charged to each jobset, keyed by
            # project and jobset name; shared by all slots of this
            # system type.
            my $timeSpentPerJobset;

            j: while ($extraAllowed-- > 0) {

                my @runnableJobsets = $db->resultset('Builds')->search(
                    { finished => 0, busy => 0, system => $system->system },
                    { select => ['project', 'jobset'], distinct => 1 });

                # Nothing is runnable for this system type, and asking
                # again within the same transaction cannot change that.
                last if @runnableJobsets == 0;

                my $windowSize = 24 * 3600;
                my $costPerBuild = 30;
                my $totalWindowSize = $windowSize * $max;

                my @res;

                foreach my $b (@runnableJobsets) {
                    my $jobset = $db->resultset('Jobsets')->find($b->get_column('project'), $b->get_column('jobset')) or die;

                    my $timeSpent = $timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')};

                    if (!defined $timeSpent) {
                        # Sum of the build step durations of all builds
                        # that are unfinished or stopped within the
                        # window; running steps count up to now.
                        $timeSpent = $jobset->builds->search(
                            { },
                            { where => \ ("(finished = 0 or (me.stoptime >= " . (time() - $windowSize) . "))")
                            , join => 'buildsteps'
                            , select => \ "sum(coalesce(buildsteps.stoptime, ${\time}) - buildsteps.starttime)"
                            , as => "sum" })->single->get_column("sum") // 0;

                        # Add a 30s penalty for each started build.  This
                        # is to account for jobsets that have running
                        # builds but no build steps yet.
                        $timeSpent += $jobset->builds->search({ finished => 0, busy => 1 })->count * $costPerBuild;

                        $timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')} = $timeSpent;
                    }

                    my $share = $jobset->schedulingshares || 1; # prevent division by zero
                    my $used = $timeSpent / ($totalWindowSize * ($share / $totalShares));

                    #printf STDERR "%s:%s: %d s, total used = %.2f%%, share used = %.2f%%\n", $jobset->get_column('project'), $jobset->name, $timeSpent, $timeSpent / $totalWindowSize * 100, $used * 100;

                    push @res, { jobset => $jobset, used => $used };
                }

                # Try the jobsets that used the least of their share first.
                foreach my $r (sort { $a->{used} <=> $b->{used} } @res) {
                    my $jobset = $r->{jobset};
                    #print STDERR "selected ", $jobset->get_column('project'), ':', $jobset->name, "\n";

                    # Select the highest-priority build for this jobset.
                    my @builds = $jobset->builds->search(
                        { finished => 0, busy => 0, system => $system->system },
                        { order_by => ["priority DESC", "id"] });

                    foreach my $build (@builds) {
                        next if $blockedBuilds->has($build->id);

                        # Find a dependency of $build that has no queued
                        # dependencies itself.  This isn't strictly necessary,
                        # but it ensures that Nix builds are done as part of
                        # their corresponding Hydra builds, rather than as a
                        # dependency of some other Hydra build.
                        while (my $dep = findBuildDependencyInQueue($buildsByDrv, $build)) {
                            $build = $dep;
                        }
                        next if $build->busy;

                        printf STDERR "starting build %d (%s:%s:%s) on %s; jobset at %.2f%% of its share\n",
                            $build->id, $build->project->name, $build->jobset->name, $build->job->name, $build->system, $r->{used} * 100;

                        # Start from a fresh log file for this build.
                        my $logfile = getcwd . "/logs/" . $build->id;
                        mkdir(dirname $logfile);
                        unlink($logfile);
                        $build->update(
                            { busy => 1
                            , locker => $$
                            , logfile => $logfile
                            });
                        push @buildsStarted, $build;

                        # Charge the jobset right away so that the next
                        # slot sees the build that was just started.
                        $timeSpentPerJobset->{$jobset->get_column('project')}->{$jobset->name} += $costPerBuild;

                        blockBuilds($buildsByDrv, $blockedBuilds, $build);

                        next j;
                    }
                }

                last; # nothing found, give up on this system type
            }
        }

        $lastTime = time();

        $_->update({ starttime => time() }) foreach @buildsStarted;
    });

    # Actually start the builds we just selected.  We need to do this
    # outside the transaction in case it aborts or something.
    foreach my $build (@buildsStarted) {
        my $id = $build->id;
        eval {
            my $logfile = $build->logfile;
            my $child = fork();
            die "cannot fork: $!" unless defined $child;
            if ($child == 0) {
                # Child: send stdout and stderr to the log file and
                # turn into hydra-build.  Whatever goes wrong, the child
                # must never return into the scheduler code.
                eval {
                    open(my $log, '>', $logfile) or die "cannot create logfile $logfile: $!";
                    POSIX::dup2(fileno($log), 1) or die "cannot redirect stdout to $logfile: $!";
                    POSIX::dup2(fileno($log), 2) or die "cannot redirect stderr to $logfile: $!";
                    # exec only returns on failure, and it does so
                    # without dying; raise the error so it is reported.
                    exec("hydra-build", $id) or die "cannot exec hydra-build: $!";
                };
                warn "cannot start build $id: $@";
                POSIX::_exit(1);
            }
        };
        if ($@) {
            warn $@;
            # No process is running this build: release it and clear
            # the locker, as unlockDeadBuilds does.
            txn_do($db, sub {
                $build->update({ busy => 0, locker => "" });
            });
        }
    }
}
|
|
|
|
|
|
# With "--unlock" as the only argument, release the locks of dead
# builds once and exit without scheduling anything.
if (@ARGV == 1 && $ARGV[0] eq "--unlock") {
    unlockDeadBuilds();
    exit 0;
}


# Scheduler loop: reap finished children, unlock dead builds and start
# new ones, then pause for five seconds.  An error in one pass is
# reported and does not stop the loop.
for (;;) {
    eval {
        # Clean up zombies.
        1 while waitpid(-1, WNOHANG()) > 0;

        unlockDeadBuilds();

        checkBuilds();
    };
    warn $@ if $@;

    # print "sleeping...\n";
    sleep(5);
}
|