Allow determinism checking for entire jobsets
Setting `xxx-jobset-repeats = patchelf:master:2` in the Hydra configuration (the format is `<project>:<jobset>:<repeats>`; several whitespace-separated entries may be given) will cause Hydra to perform every build step in the specified jobset 2 additional times (i.e. 3 times in total). Non-determinism is not fatal unless the derivation has the attribute `isDeterministic = true`; otherwise we just note the lack of determinism in the Hydra database. This will allow us to get stats about the (lack of) reproducibility of all of Nixpkgs.
This commit is contained in:
@ -116,7 +116,7 @@ static void copyClosureTo(ref<Store> destStore,
|
||||
|
||||
void State::buildRemote(ref<Store> destStore,
|
||||
Machine::ptr machine, Step::ptr step,
|
||||
unsigned int maxSilentTime, unsigned int buildTimeout,
|
||||
unsigned int maxSilentTime, unsigned int buildTimeout, unsigned int repeats,
|
||||
RemoteResult & result, std::shared_ptr<ActiveStep> activeStep)
|
||||
{
|
||||
assert(BuildResult::TimedOut == 8);
|
||||
@ -263,9 +263,10 @@ void State::buildRemote(ref<Store> destStore,
|
||||
to << maxSilentTime << buildTimeout;
|
||||
if (GET_PROTOCOL_MINOR(remoteVersion) >= 2)
|
||||
to << 64 * 1024 * 1024; // == maxLogSize
|
||||
if (GET_PROTOCOL_MINOR(remoteVersion) >= 3)
|
||||
// FIXME: make the number of repeats configurable.
|
||||
to << (step->isDeterministic ? 1 : 0);
|
||||
if (GET_PROTOCOL_MINOR(remoteVersion) >= 3) {
|
||||
to << repeats // == build-repeat
|
||||
<< step->isDeterministic; // == enforce-determinism
|
||||
}
|
||||
to.flush();
|
||||
|
||||
result.startTime = time(0);
|
||||
@ -295,6 +296,10 @@ void State::buildRemote(ref<Store> destStore,
|
||||
result.stepStatus = bsSuccess;
|
||||
} else {
|
||||
result.errorMsg = readString(from);
|
||||
if (GET_PROTOCOL_MINOR(remoteVersion) >= 3) {
|
||||
result.timesBuilt = readInt(from);
|
||||
result.isNonDeterministic = readInt(from);
|
||||
}
|
||||
switch ((BuildResult::Status) res) {
|
||||
case BuildResult::Built:
|
||||
result.stepStatus = bsSuccess;
|
||||
|
@ -86,6 +86,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
BuildID buildId;
|
||||
Path buildDrvPath;
|
||||
unsigned int maxSilentTime, buildTimeout;
|
||||
unsigned int repeats = step->isDeterministic ? 1 : 0;
|
||||
|
||||
{
|
||||
std::set<Build::ptr> dependents;
|
||||
@ -113,6 +114,11 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
build = build2;
|
||||
enqueueNotificationItem({NotificationItem::Type::BuildStarted, build->id});
|
||||
}
|
||||
{
|
||||
auto i = jobsetRepeats.find(std::make_pair(build2->projectName, build2->jobsetName));
|
||||
if (i != jobsetRepeats.end())
|
||||
repeats = std::max(repeats, i->second);
|
||||
}
|
||||
}
|
||||
if (!build) build = *dependents.begin();
|
||||
|
||||
@ -121,8 +127,8 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
maxSilentTime = build->maxSilentTime;
|
||||
buildTimeout = build->buildTimeout;
|
||||
|
||||
printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)")
|
||||
% step->drvPath % machine->sshName % buildId % (dependents.size() - 1));
|
||||
printInfo("performing step ‘%s’ %d times on ‘%s’ (needed by build %d and %d others)",
|
||||
step->drvPath, repeats + 1, machine->sshName, buildId, (dependents.size() - 1));
|
||||
}
|
||||
|
||||
bool quit = buildId == buildOne && step->drvPath == buildDrvPath;
|
||||
@ -162,7 +168,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
/* Do the build. */
|
||||
try {
|
||||
/* FIXME: referring builds may have conflicting timeouts. */
|
||||
buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result, activeStep);
|
||||
buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, repeats, result, activeStep);
|
||||
} catch (NoTokens & e) {
|
||||
result.stepStatus = bsNarSizeLimitExceeded;
|
||||
} catch (Error & e) {
|
||||
@ -224,8 +230,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
auto mc = startDbUpdate();
|
||||
{
|
||||
pqxx::work txn(*conn);
|
||||
finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, buildId,
|
||||
stepNr, machine->sshName, result.stepStatus, result.errorMsg);
|
||||
finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
|
||||
txn.commit();
|
||||
}
|
||||
stepFinished = true;
|
||||
@ -279,8 +284,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
|
||||
pqxx::work txn(*conn);
|
||||
|
||||
finishBuildStep(txn, result.startTime, result.stopTime, result.overhead,
|
||||
buildId, stepNr, machine->sshName, bsSuccess);
|
||||
finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
|
||||
|
||||
for (auto & b : direct) {
|
||||
printMsg(lvlInfo, format("marking build %1% as succeeded") % b->id);
|
||||
@ -386,8 +390,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||
|
||||
if (result.stepStatus != bsCachedFailure && !stepFinished) {
|
||||
assert(stepNr);
|
||||
finishBuildStep(txn, result.startTime, result.stopTime, result.overhead,
|
||||
buildId, stepNr, machine->sshName, result.stepStatus, result.errorMsg);
|
||||
finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
|
||||
}
|
||||
|
||||
/* Mark all builds that depend on this derivation as failed. */
|
||||
|
@ -264,20 +264,21 @@ unsigned int State::createBuildStep(pqxx::work & txn, time_t startTime, BuildID
|
||||
}
|
||||
|
||||
|
||||
void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, unsigned int overhead,
|
||||
BuildID buildId, unsigned int stepNr, const std::string & machine, BuildStatus status,
|
||||
const std::string & errorMsg, BuildID propagatedFrom)
|
||||
void State::finishBuildStep(pqxx::work & txn, const RemoteResult & result,
|
||||
BuildID buildId, unsigned int stepNr, const std::string & machine)
|
||||
{
|
||||
assert(startTime);
|
||||
assert(stopTime);
|
||||
assert(result.startTime);
|
||||
assert(result.stopTime);
|
||||
txn.parameterized
|
||||
("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, startTime = $6, stopTime = $7, machine = $8, overhead = $9 where build = $2 and stepnr = $3")
|
||||
((int) status)(buildId)(stepNr)
|
||||
(propagatedFrom, propagatedFrom != 0)
|
||||
(errorMsg, errorMsg != "")
|
||||
(startTime)(stopTime)
|
||||
("update BuildSteps set busy = 0, status = $1, errorMsg = $4, startTime = $5, stopTime = $6, machine = $7, overhead = $8, timesBuilt = $9, isNonDeterministic = $10 where build = $2 and stepnr = $3")
|
||||
((int) result.stepStatus)(buildId)(stepNr)
|
||||
(result.errorMsg, result.errorMsg != "")
|
||||
(result.startTime)(result.stopTime)
|
||||
(machine, machine != "")
|
||||
(overhead, overhead != 0).exec();
|
||||
(result.overhead, result.overhead != 0)
|
||||
(result.timesBuilt, result.timesBuilt > 0)
|
||||
(result.isNonDeterministic, result.timesBuilt > 1)
|
||||
.exec();
|
||||
}
|
||||
|
||||
|
||||
@ -809,6 +810,13 @@ void State::run(BuildID buildOne)
|
||||
|
||||
useSubstitutes = isTrue(hydraConfig["use-substitutes"]);
|
||||
|
||||
// FIXME: hacky mechanism for configuring determinism checks.
|
||||
for (auto & s : tokenizeString<Strings>(hydraConfig["xxx-jobset-repeats"])) {
|
||||
auto s2 = tokenizeString<std::vector<std::string>>(s, ":");
|
||||
if (s2.size() != 3) throw Error("bad value in xxx-jobset-repeats");
|
||||
jobsetRepeats.emplace(std::make_pair(s2[0], s2[1]), std::stoi(s2[2]));
|
||||
}
|
||||
|
||||
{
|
||||
auto conn(dbPool.get());
|
||||
clearBusy(*conn, 0);
|
||||
|
@ -48,6 +48,9 @@ struct RemoteResult
|
||||
bool canCache = false; // for bsFailed
|
||||
std::string errorMsg; // for bsAborted
|
||||
|
||||
unsigned int timesBuilt = 0;
|
||||
bool isNonDeterministic = false;
|
||||
|
||||
time_t startTime = 0, stopTime = 0;
|
||||
unsigned int overhead = 0;
|
||||
nix::Path logFile;
|
||||
@ -414,6 +417,10 @@ private:
|
||||
from showing up as busy until the queue runner is restarted. */
|
||||
nix::Sync<std::set<std::pair<BuildID, int>>> orphanedSteps;
|
||||
|
||||
/* How often the build steps of a jobset should be repeated in
|
||||
order to detect non-determinism. */
|
||||
std::map<std::pair<std::string, std::string>, unsigned int> jobsetRepeats;
|
||||
|
||||
public:
|
||||
State();
|
||||
|
||||
@ -437,10 +444,8 @@ private:
|
||||
const std::string & machine, BuildStatus status, const std::string & errorMsg = "",
|
||||
BuildID propagatedFrom = 0);
|
||||
|
||||
void finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime,
|
||||
unsigned int overhead, BuildID buildId, unsigned int stepNr,
|
||||
const std::string & machine, BuildStatus status, const std::string & errorMsg = "",
|
||||
BuildID propagatedFrom = 0);
|
||||
void finishBuildStep(pqxx::work & txn, const RemoteResult & result, BuildID buildId, unsigned int stepNr,
|
||||
const std::string & machine);
|
||||
|
||||
int createSubstitutionStep(pqxx::work & txn, time_t startTime, time_t stopTime,
|
||||
Build::ptr build, const nix::Path & drvPath, const std::string & outputName, const nix::Path & storePath);
|
||||
@ -492,6 +497,7 @@ private:
|
||||
void buildRemote(nix::ref<nix::Store> destStore,
|
||||
Machine::ptr machine, Step::ptr step,
|
||||
unsigned int maxSilentTime, unsigned int buildTimeout,
|
||||
unsigned int repeats,
|
||||
RemoteResult & result, std::shared_ptr<ActiveStep> activeStep);
|
||||
|
||||
void markSucceededBuild(pqxx::work & txn, Build::ptr build,
|
||||
|
Reference in New Issue
Block a user