Skip to content

Commit

Permalink
Merge pull request #1213 from SpiNNakerManchester/fix_bmp_issue
Browse files Browse the repository at this point in the history
Fix bmp issue
  • Loading branch information
rowleya authored Jan 16, 2025
2 parents 68a25eb + 1221e7b commit ecc96b3
Show file tree
Hide file tree
Showing 16 changed files with 291 additions and 26 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,5 @@ jobs:
run: mvn apache-rat:check --settings $SETTINGS_FILE -V || (find . -type f -name 'rat*.txt' -print | xargs grep -l unapproved | xargs cat; exit 1)
- name: "Check: CITATION.cff validity"
uses: dieghernan/cff-validator@v3
with:
install-r: true
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,5 @@ nbactions.xml

# Output of shade plugin
dependency-reduced-pom.xml

/src/support/eclipse/
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,13 @@ private void updateJobNow(int jobId, JobState sourceState,

private boolean update(int jobId, JobState sourceState,
JobState targetState, Connection c) {
log.debug("Updating job {} from {} to {}", jobId, sourceState,
targetState);
try (var getChangeStatus = c.query(COUNT_CHANGES_FOR_JOB);
var setJobState = c.update(SET_STATE_PENDING);
var setJobDestroyed = c.update(SET_STATE_DESTROYED);
var deleteChanges = c.update(DELETE_PENDING)) {
var deleteChanges = c.update(DELETE_PENDING);
var deleteTask = c.update(DELETE_TASK)) {
// Count pending changes for this state change
var status = getChangeStatus.call1(ChangeStatus::new,
jobId, sourceState, targetState).orElseThrow(
Expand Down Expand Up @@ -234,13 +237,23 @@ private boolean update(int jobId, JobState sourceState,
// If there are no more pending changes and no errors,
// set the job state to the target state
log.debug("Job {} moving to state {}", jobId, targetState);

// If destroyed, we don't need to set the state as it will be gone
if (targetState == DESTROYED) {
int rows = setJobDestroyed.call(jobId);
if (rows != 1) {
log.warn("unexpected number of rows affected by "
+ "destroy in state update: {}", rows);
}
return rows > 0;

// If ready, we can now delete the job request too
} else if (targetState == READY) {
int rows = deleteTask.call(jobId);
if (rows != 1) {
log.warn("unexpected number of rows affected by ready in "
+ "state update: {}", rows);
}
}

return setJobState.call(targetState, jobId) > 0;
Expand Down Expand Up @@ -387,9 +400,6 @@ private final class AllocSQL extends PowerSQL {
/** Get the list of allocation tasks for jobs in a given state. */
private final Query getTasks;

/** Delete an allocation task. */
private final Update delete;

/** Find a single free board. */
private final Query findFreeBoard;

Expand Down Expand Up @@ -439,7 +449,6 @@ private final class AllocSQL extends PowerSQL {
super(conn);
bumpImportance = conn.update(BUMP_IMPORTANCE);
getTasks = conn.query(getAllocationTasks);
delete = conn.update(DELETE_TASK);
findFreeBoard = conn.query(FIND_FREE_BOARD);
getRectangles = conn.query(findRectangle);
getRectangleAt = conn.query(findRectangleAt);
Expand All @@ -455,7 +464,6 @@ public void close() {
super.close();
bumpImportance.close();
getTasks.close();
delete.close();
findFreeBoard.close();
getRectangles.close();
getRectangleAt.close();
Expand Down Expand Up @@ -642,7 +650,9 @@ private Allocations allocate(Connection conn) {
int maxImportance = -1;
log.trace("Allocate running");
var allocations = new Allocations();
for (AllocTask task : sql.getTasks.call(AllocTask::new, QUEUED)) {
var tasks = sql.getTasks.call(AllocTask::new, QUEUED);
log.debug("allocate for {} tasks", tasks.size());
for (AllocTask task : tasks) {
if (task.importance > maxImportance) {
maxImportance = task.importance;
} else if (task.importance < maxImportance
Expand All @@ -651,10 +661,6 @@ private Allocations allocate(Connection conn) {
continue;
}
var handled = task.allocate(sql);
// If we handled it, delete the request
if (handled.size() > 0) {
sql.delete.call(task.id);
}
allocations.addAll(task.jobId, handled);
log.debug("allocate for {} (job {}): {}", task.id,
task.jobId, handled);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,12 @@ public interface TestAPI {
*/
void prepare();

/**
* Reset the transceivers stored in the workers after installing a new
* transceiver.
*/
void resetTransceivers();

/**
* The core of the scheduler.
*
Expand Down Expand Up @@ -1253,6 +1259,13 @@ public void prepare() {
makeWorkers();
}

@Override
public void resetTransceivers() {
for (var worker : workers.values()) {
worker.control = null;
}
}

@Override
public void processRequests(long millis, Collection<Integer> bmps)
throws IOException, SpinnmanException,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ public Blacklist getCurrentBlacklist() {
@Override
public void setFactory(TestTransceiverFactory factory) {
testFactory = factory;
synchronized (txrxMap) {
txrxMap.clear();
}
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -660,9 +660,9 @@ public abstract class SQLQueries {
+ "WHERE board_id = :board_id";

/** Delete an allocation task. */
@Parameter("request_id")
@Parameter("job_id")
protected static final String DELETE_TASK =
"DELETE FROM job_request WHERE req_id = :request_id";
"DELETE FROM job_request WHERE job_id = :job_id";

/** Find a single free board. */
@Parameter("machine_id")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public static final class ViewFactory {
* @param viewName
* The name of the view that this class will make.
*/
public ViewFactory(@CompileTimeConstant String viewName) {
public ViewFactory(@CompileTimeConstant final String viewName) {
view = viewName;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ public interface SupportQueries {
"SELECT COUNT(*) AS cnt FROM job_request "
+ "JOIN jobs USING (job_id) WHERE job_state = :job_state";

/** Count the active requests for a job. */
@Parameter("job_id")
@ResultColumn("cnt")
@SingleRowResult
String TEST_COUNT_REQUESTS_FOR_JOB =
"SELECT COUNT(*) AS cnt FROM job_request WHERE job_id = :job_id";

/** Count the active power change requests. */
@ResultColumn("cnt")
@SingleRowResult
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
/*
* Copyright (c) 2025 The University of Manchester
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.manchester.spinnaker.alloc.allocator;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import static uk.ac.manchester.spinnaker.alloc.model.JobState.READY;
import static uk.ac.manchester.spinnaker.alloc.model.JobState.QUEUED;
import static uk.ac.manchester.spinnaker.alloc.model.JobState.DESTROYED;

import java.io.IOException;
import java.util.List;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.TestPropertySource;
import org.springframework.test.context.junit.jupiter.web.SpringJUnitWebConfig;

import uk.ac.manchester.spinnaker.alloc.TestSupport;
import uk.ac.manchester.spinnaker.alloc.allocator.AllocatorTask.TestAPI;
import uk.ac.manchester.spinnaker.alloc.bmp.BMPController;
import uk.ac.manchester.spinnaker.alloc.bmp.MockFailTransceiver;
import uk.ac.manchester.spinnaker.alloc.bmp.MockTransceiver;
import uk.ac.manchester.spinnaker.alloc.bmp.TransceiverFactory;
import uk.ac.manchester.spinnaker.alloc.model.JobState;

@SpringBootTest
@SpringJUnitWebConfig(TestSupport.Config.class)
@ActiveProfiles("unittest")
@TestPropertySource(properties = {
// These tests sometimes hold transactions for a long time; this is OK
"spalloc.sqlite.lock-note-threshold=2200ms",
"spalloc.sqlite.lock-warn-threshold=3s"
})
public class AllocatorFailTest extends TestSupport {

@Autowired
private AllocatorTask alloc;

private BMPController.TestAPI bmpCtrl;

private TransceiverFactory txrxFactory;

@BeforeEach
@SuppressWarnings("deprecation")
void checkSetup(@Autowired TransceiverFactory txrxFactory,
@Autowired BMPController bmpCtrl) throws IOException {

assumeTrue(db != null, "spring-configured DB engine absent");
killDB();
setupDB3();
this.txrxFactory = txrxFactory;
this.bmpCtrl = bmpCtrl.getTestAPI();
this.bmpCtrl.prepare();
this.bmpCtrl.clearBmpException();
}

@Test
public void testBoardFail() {
// This is messier; can't have a transaction open and roll it back
try (var c = db.getConnection()) {
this.conn = c;
int job = c.transaction(() -> makeQueuedJob(100000));
try {
log.info("Created job {}", job);
makeAllocBySizeRequest(job, 1);
var allocations = c.transaction(() -> {
return getAllocTester().allocate();
});

var boards = c.query(GET_JOB_BOARDS).call(row -> {
return row.getInt("board_id");
}, job);
assertEquals(1, boards.size());
var firstBoard = boards.get(0);
log.info("Allocated board: {}", firstBoard);

// Check the board last power on time
var powerOffTime = c.query(GET_BOARD_POWER_INFO).call1(row -> {
return row.getLong("power_off_timestamp");
}, firstBoard);
log.info("Power off time: {}", powerOffTime);

// Make a transceiver that will fail the allocation
MockFailTransceiver.installIntoFactory(txrxFactory);
allocations.updateBMPs();
snooze1s();
snooze1s();

// By now it should have all failed, so the boards should
// no longer be allocated
var jobDeallocatedBoards = c.query(GET_JOB_BOARDS).call(row -> {
return row.getInt("board_id");
}, job);
assertEquals(0, jobDeallocatedBoards.size(),
"Boards should be unallocated, but got: "
+ jobDeallocatedBoards);

// The failed board should have powered off again so new time
var newPowerOffTime = c.query(GET_BOARD_POWER_INFO).call1(row -> {
return row.getLong("power_off_timestamp");
}, firstBoard);
log.info("New Power off time: {}", newPowerOffTime);
assertNotEquals(powerOffTime, newPowerOffTime);

// The job should be queued again, with a single request
assertState(job, QUEUED, 1, 0);

// Make a transceiver that *doesn't* fail the requests
MockTransceiver.installIntoFactory(txrxFactory);
this.bmpCtrl.resetTransceivers();

// So we should be able to reallocate the job to a new board
var newAllocations = c.transaction(() -> {
return getAllocTester().allocate();
});
var newBoards = c.query(GET_JOB_BOARDS).call(row -> {
return row.getInt("board_id");
}, job);
assertEquals(1, newBoards.size());
var newFirstBoard = newBoards.get(0);
log.info("Allocated new board: {}", newFirstBoard);

// That board should be different
assertNotEquals(firstBoard, newFirstBoard);

// And now lets see if it works...
newAllocations.updateBMPs();
snooze1s();
snooze1s();
assertState(job, READY, 0, 0);

// We should also now not have any job requests
var requestCount = c.query(TEST_COUNT_REQUESTS_FOR_JOB).call1(
row -> {return row.getInt("cnt");}, job).get();
assertEquals(0, requestCount);
} finally {
log.info("Finishing test by deleting things");
c.transaction(() -> {
// Reset the state
getAllocTester().destroyJob(job, "test");
c.update("DELETE FROM job_request").call();
c.update("DELETE FROM pending_changes").call();
c.update("UPDATE boards SET allocated_job = NULL, "
+ "power_on_timestamp = 0, "
+ "power_off_timestamp = 0").call();
});
}
}
}

@SuppressWarnings("deprecation")
private TestAPI getAllocTester() {
return alloc.getTestAPI(conn);
}

private void assertState(int jobId, JobState state, int requestCount,
int powerCount) {
var expected = List.of(state, "req", requestCount, "power", powerCount);
var got = List.of(getJobState(jobId), "req", getJobRequestCount(),
"power", getPendingPowerChanges());
assertEquals(expected, got);
}
}
Loading

0 comments on commit ecc96b3

Please sign in to comment.