author = {Károly Bósa},
title = {{Fault Tolerance for Distributed Maple}},
language = {english},
abstract = {Distributed Maple is a Java-based system for implementing in distributed environments parallel programs in the computer algebra system Maple. It has evolved from Dr. Wolfgang Schreiner's experience in the development of parallel computer algebra environments and from learning from the work of other researchers. As the problems to which the system was applied became more and more complex, the mean time between session failures became a limiting factor of the applicability of the system. However, the fact that the parallel programming model of the system is basically functional gave the chance to develop new fault tolerance mechanisms for Distributed Maple which are more effective than existing solutions targeted to general parallel applications (like checkpointing). In this thesis, we present and describe how we have extended Distributed Maple with fault tolerance such that the time spent in a long running computation is no longer wasted by the eventual occurrence of a failure. First, we introduced a mechanism for the logging of task return values and of shared object values such that after a failure a newly started session can (transparently to the application program) reuse already computed results. Then we concentrated on node failures and permanent connection failures. We implemented some new mechanisms by which a session is able to tolerate connection and node failures (even if the root node fails) without overall failure and continue normal operation. Furthermore, the system periodically attempts to restart the failed nodes and to reestablish the broken connections. Together these fault tolerance mechanisms make it possible to run computations that take much longer than the mean time between session failures. With these developments, Distributed Maple is by far the most advanced system for computer algebra concerning reliability in distributed environments.},
year = {2004},
month = {September},
translation = {0},
school = {RISC-Linz, Johannes Kepler University, Linz, Austria},
keywords = {distributed systems, fault tolerance, computer algebra},
length = {116}