8303861: Error handling step timeouts should never be blocked by OnError and others

Reviewed-by: dholmes, rkennke
This commit is contained in:
Thomas Stuefe 2023-03-14 15:51:20 +00:00
parent da044dd569
commit a00f5d24d3
3 changed files with 92 additions and 36 deletions

View file

@ -270,8 +270,8 @@ void WatcherThread::run() {
os::die(); os::die();
} }
// Wait a second, then recheck for timeout. // Wait a bit, then recheck for timeout.
os::naked_short_sleep(999); os::naked_short_sleep(250);
} }
} }

View file

@ -1,6 +1,7 @@
/* /*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2020 SAP SE. All rights reserved. * Copyright (c) 2017, 2020 SAP SE. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -1744,43 +1745,68 @@ void VMError::show_message_box(char *buf, int buflen) {
} while (yes); } while (yes);
} }
// Timeout handling: check if a timeout happened (either a single step did // Fatal error handling is subject to several timeouts:
// timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt // - a global timeout (controlled via ErrorLogTimeout)
// the reporting thread if that is the case. // - local error reporting step timeouts.
//
// The latter aims to "give the JVM a kick" if it gets stuck in one particular place during
// error reporting. This prevents one error reporting step from hogging all the time allotted
// to error reporting under ErrorLogTimeout.
//
// VMError::check_timeout() is called from the watcher thread and checks for either global
// or step timeout. If a timeout happened, we interrupt the reporting thread and set either
// _reporting_did_timeout or _step_did_timeout to signal which timeout fired. Function returns
// true if the *global* timeout fired, which will cause WatcherThread to shut down the JVM
// immediately.
bool VMError::check_timeout() { bool VMError::check_timeout() {
// This function is supposed to be called from watcher thread during fatal error handling only.
assert(VMError::is_error_reported(), "Only call during error handling");
assert(Thread::current()->is_Watcher_thread(), "Only call from watcher thread");
if (ErrorLogTimeout == 0) { if (ErrorLogTimeout == 0) {
return false; return false;
} }
// Do not check for timeouts if we still have a message box to show to the // There are three situations where we suppress the *global* error timeout:
// user or if there are OnError handlers to be run. // - if the JVM is embedded and the launcher has its abort hook installed.
if (ShowMessageBoxOnError // That must be allowed to run.
|| (OnError != nullptr && OnError[0] != '\0') // - if the user specified one or more OnError commands to run, and these
|| Arguments::abort_hook() != nullptr) { // did not yet run. These must have finished.
return false; // - if the user (typically developer) specified ShowMessageBoxOnError,
} // and the error box has not yet been shown
const bool ignore_global_timeout =
(ShowMessageBoxOnError
|| (OnError != nullptr && OnError[0] != '\0')
|| Arguments::abort_hook() != nullptr);
const jlong reporting_start_time_l = get_reporting_start_time();
const jlong now = get_current_timestamp(); const jlong now = get_current_timestamp();
// Timestamp is stored in nanos.
if (reporting_start_time_l > 0) { // Global timeout hit?
const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR; if (!ignore_global_timeout) {
if (end <= now && !_reporting_did_timeout) { const jlong reporting_start_time = get_reporting_start_time();
// We hit ErrorLogTimeout and we haven't interrupted the reporting // Timestamp is stored in nanos.
// thread yet. if (reporting_start_time > 0) {
_reporting_did_timeout = true; const jlong end = reporting_start_time + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
interrupt_reporting_thread(); if (end <= now && !_reporting_did_timeout) {
return true; // global timeout // We hit ErrorLogTimeout and we haven't interrupted the reporting
// thread yet.
_reporting_did_timeout = true;
interrupt_reporting_thread();
return true; // global timeout
}
} }
} }
const jlong step_start_time_l = get_step_start_time(); // Reporting step timeout?
if (step_start_time_l > 0) { const jlong step_start_time = get_step_start_time();
if (step_start_time > 0) {
// A step times out after a quarter of the total timeout. Steps are mostly fast unless they // A step times out after a quarter of the total timeout. Steps are mostly fast unless they
// hang for some reason, so this simple rule allows for three hanging step and still // hang for some reason, so this simple rule allows for three hanging step and still
// hopefully leaves time enough for the rest of the steps to finish. // hopefully leaves time enough for the rest of the steps to finish.
const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4; const int max_step_timeout_secs = 5;
const jlong timeout_duration = MAX2((jlong)max_step_timeout_secs, (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4);
const jlong end = step_start_time + timeout_duration;
if (end <= now && !_step_did_timeout) { if (end <= now && !_step_did_timeout) {
// The step timed out and we haven't interrupted the reporting // The step timed out and we haven't interrupted the reporting
// thread yet. // thread yet.

View file

@ -1,5 +1,6 @@
/* /*
* Copyright (c) 2017, 2022 SAP SE. All rights reserved. * Copyright (c) 2017, 2022 SAP SE. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
@ -26,6 +27,9 @@ import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import jdk.test.lib.process.OutputAnalyzer; import jdk.test.lib.process.OutputAnalyzer;
@ -33,7 +37,7 @@ import jdk.test.lib.Platform;
import jdk.test.lib.process.ProcessTools; import jdk.test.lib.process.ProcessTools;
/* /*
* @test * @test id=default
* @bug 8166944 * @bug 8166944
* @summary Hanging Error Reporting steps may lead to torn error logs * @summary Hanging Error Reporting steps may lead to torn error logs
* @modules java.base/jdk.internal.misc * @modules java.base/jdk.internal.misc
@ -43,6 +47,16 @@ import jdk.test.lib.process.ProcessTools;
* @author Thomas Stuefe (SAP) * @author Thomas Stuefe (SAP)
*/ */
/*
* @test id=with-on-error
* @bug 8303861
* @summary Error handling step timeouts should never be blocked by OnError etc.
* @modules java.base/jdk.internal.misc
* @library /test/lib
* @requires (vm.debug == true) & (os.family != "windows")
* @run driver TimeoutInErrorHandlingTest with-on-error
*/
public class TimeoutInErrorHandlingTest { public class TimeoutInErrorHandlingTest {
public static final boolean verbose = System.getProperty("verbose") != null; public static final boolean verbose = System.getProperty("verbose") != null;
@ -71,14 +85,28 @@ public class TimeoutInErrorHandlingTest {
* little timeout messages to see that repeated timeout handling is basically working. * little timeout messages to see that repeated timeout handling is basically working.
*/ */
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder( boolean withOnError = false;
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M", if (args.length > 0) {
"-XX:ErrorHandlerTest=14", switch (args[0]) {
"-XX:+TestUnresponsiveErrorHandler", case "with-on-error": withOnError = true; break;
"-XX:ErrorLogTimeout=" + ERROR_LOG_TIMEOUT, default: throw new RuntimeException("Invalid argument " + args[1]);
"-XX:-CreateCoredumpOnCrash", }
"-version"); }
List<String> arguments = new ArrayList<>();
Collections.addAll(arguments,
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M",
"-XX:ErrorHandlerTest=14",
"-XX:+TestUnresponsiveErrorHandler",
"-XX:ErrorLogTimeout=" + ERROR_LOG_TIMEOUT,
"-XX:-CreateCoredumpOnCrash");
if (withOnError) {
arguments.add("-XX:OnError=echo hi");
}
arguments.add("-version");
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(arguments);
OutputAnalyzer output_detail = new OutputAnalyzer(pb.start()); OutputAnalyzer output_detail = new OutputAnalyzer(pb.start());
@ -92,8 +120,10 @@ public class TimeoutInErrorHandlingTest {
output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*"); output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*");
output_detail.shouldMatch("# +(?:SIGSEGV|SIGBUS|EXCEPTION_ACCESS_VIOLATION).*"); output_detail.shouldMatch("# +(?:SIGSEGV|SIGBUS|EXCEPTION_ACCESS_VIOLATION).*");
// VM should have been aborted by WatcherThread // Unless we specified OnError, VM should have been aborted by WatcherThread
output_detail.shouldMatch(".*timer expired, abort.*"); if (!withOnError) {
output_detail.shouldMatch(".*timer expired, abort.*");
}
// extract hs-err file // extract hs-err file
File hs_err_file; File hs_err_file;