ContentionStrategy.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.service.paxos;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

import com.codahale.metrics.Snapshot;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.NoSpamLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Locale;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.DoubleSupplier;
import java.util.function.LongBinaryOperator;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.Double.parseDouble;
import static java.lang.Integer.parseInt;
import static java.lang.Math.*;
import static java.util.Arrays.stream;
import static java.util.concurrent.TimeUnit.*;
import static org.apache.cassandra.config.DatabaseDescriptor.*;
import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics;
import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casWriteMetrics;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
import static org.apache.cassandra.utils.Clock.waitUntil;

/**
 * <p>A strategy for making back-off decisions for Paxos operations that fail to make progress because of other paxos operations.
 * The strategy is defined by four factors: <ul>
 * <li> {@link #min}
 * <li> {@link #max}
 * <li> {@link #minDelta}
 * <li> {@link #waitRandomizer}
 * </ul>
 *
 * <p>The first three represent time periods, and may be defined dynamically based on a simple calculation over: <ul>
 * <li> {@code pX()} recent experienced latency distribution for successful operations,
 *                 e.g. {@code p50(rw)} the maximum of read and write median latencies,
 *                      {@code p999(r)} the 99.9th percentile of read latencies
 * <li> {@code attempts} the number of failed attempts made by the operation so far
 * <li> {@code constant} a user provided floating point constant
 * </ul>
 *
 * <p>Their calculation may take any of these forms
 * <li> constant            {@code $constant$[mu]s}
 * <li> dynamic constant    {@code pX() * constant}
 * <li> dynamic linear      {@code pX() * constant * attempts}
 * <li> dynamic exponential {@code pX() * constant ^ attempts}
 *
 * <p>Furthermore, the dynamic calculations can be bounded with a min/max, like so:
 *  {@code min[mu]s <= dynamic expr <= max[mu]s}
 *
 * e.g.
 * <li> {@code 10ms <= p50(rw)*0.66}
 * <li> {@code 10ms <= p95(rw)*1.8^attempts <= 100ms}
 * <li> {@code 5ms <= p50(rw)*0.5}
 *
 * <p>These calculations are put together to construct a range from which we draw a random number.
 * The period we wait for {@code X} will be drawn so that {@code min <= X < max}.
 *
 * <p>With the constraint that {@code max} must be {@code minDelta} greater than {@code min},
 * but no greater than its expression-defined maximum. {@code max} will be increased up until
 * this point, after which {@code min} will be decreased until this gap is imposed.
 *
 * <p>The {@link #waitRandomizer} property specifies the manner in which a random value is drawn from the range.
 * It is defined using one of the following specifiers:
 * <li> uniform
 * <li> exp($power$) or exponential($power$)
 * <li> qexp($power$) or qexponential($power$) or quantizedexponential($power$)
 *
 * The uniform specifier is self-explanatory, selecting all values in the range with equal probability.
 * The exponential specifier draws values towards the end of the range with higher probability, raising
 * a floating point number in the range [0..1.0) to the power provided, and translating the resulting value
 * to a uniform value in the range.
 * The quantized exponential specifier partitions the range into {@code attempts} buckets, then applies the pure
 * exponential approach to draw values from [0..attempts), before drawing a uniform value from the corresponding bucket.
 *
 * <p>Finally, there is also a {@link #traceAfterAttempts} property that permits initiating tracing of operations
 * that experience a certain minimum number of failed paxos rounds due to contention. A setting of 0 or 1 will initiate
 * a trace session after the first failed ballot.
 */
public class ContentionStrategy
{
    private static final Logger logger = LoggerFactory.getLogger(ContentionStrategy.class);

    // Grammar of a bound specification. Either a bare constant ("0", "<n>ms" or "<n>us"), or
    //   [<min> <=] (p<X>(r|w|rw|wr) | <constant>) [* [<modifier>] [(*|^) attempts]] [<= <max>]
    private static final Pattern BOUND = Pattern.compile(
                "(?<const>0|[0-9]+[mu]s)" +
                "|((?<min>0|[0-9]+[mu]s) *<= *)?" +
                    "(p(?<perc>[0-9]+)\\((?<rw>r|w|rw|wr)\\)|(?<constbase>0|[0-9]+[mu]s))" +
                    "\\s*([*]\\s*(?<mod>[0-9.]+)?\\s*(?<modkind>[*^]\\s*attempts)?)?" +
                "( *<= *(?<max>0|[0-9]+[mu]s))?");
    // A time literal: "0", whole milliseconds ("<n>ms") or whole microseconds ("<n>us")
    private static final Pattern TIME = Pattern.compile(
                "0|([0-9]+)ms|([0-9]+)us");
    // A randomizer specification: "uniform", an exponential form taking a power, or a quantized exponential form taking a power
    private static final Pattern RANDOMIZER = Pattern.compile(
                "uniform|exp(onential)?[(](?<exp>[0-9.]+)[)]|q(uantized)?exp(onential)?[(](?<qexp>[0-9.]+)[)]");
    private static final String DEFAULT_WAIT_RANDOMIZER = "qexp(1.5)"; // quantized exponential distribution with a power of 1.5
    private static final String DEFAULT_MIN = "0 <= p50(rw)*0.66"; // 66% of median latency, but never less than 0ms
    private static final String DEFAULT_MAX = "10ms <= p95(rw)*1.8^attempts <= 100ms"; // p95 latency with exponential back-off at rate of 1.8^attempts, kept within 10ms..100ms
    private static final String DEFAULT_MIN_DELTA = "5ms <= p50(rw)*0.5"; // 50% of median latency, but never less than 5ms

    // The strategy consulted by the static entry points; replaced wholesale by setStrategy
    private static volatile ContentionStrategy current;

    // Factories can be useful for testing purposes, to supply custom implementations of selectors and modifiers.
    final static LatencySelectorFactory selectors = new LatencySelectorFactory(){};
    final static LatencyModifierFactory modifiers = new LatencyModifierFactory(){};
    final static WaitRandomizerFactory randomizers = new WaitRandomizerFactory(){};

    static
    {
        // Start from the configured settings (or the built-in defaults where none are configured).
        // A trace threshold of Integer.MAX_VALUE means tracing only starts if attempts reaches that value.
        current = new ContentionStrategy(defaultWaitRandomizer(), defaultMinWait(), defaultMaxWait(), defaultMinDelta(), Integer.MAX_VALUE);
    }

    /**
     * Creates the {@link LatencyModifier} variants a bound expression can request.
     * Every non-identity variant computes in floating point and converts back with {@code saturatedCast}.
     */
    interface LatencyModifierFactory
    {
        /** @return a modifier that returns the latency unchanged, ignoring the attempt count */
        default LatencyModifier identity()
        {
            return (latency, attempts) -> latency;
        }

        /** @return a modifier computing {@code latency * constant} */
        default LatencyModifier multiply(double constant)
        {
            return (latency, attempts) -> saturatedCast(latency * constant);
        }

        /** @return a modifier computing {@code latency * multiply * attempts} (linear in attempts) */
        default LatencyModifier multiplyByAttempts(double multiply)
        {
            return (latency, attempts) -> saturatedCast(latency * multiply * attempts);
        }

        /** @return a modifier computing {@code latency * base^attempts} (exponential in attempts) */
        default LatencyModifier multiplyByAttemptsExp(double base)
        {
            return (latency, attempts) -> saturatedCast(latency * pow(base, attempts));
        }
    }

    /**
     * A source of latency values addressed by percentile.
     */
    @FunctionalInterface
    interface LatencySupplier
    {
        /**
         * @param percentile the percentile to look up, as passed by the selector
         * @return the latency at that percentile
         */
        long get(double percentile);
    }

    /**
     * Picks the base latency for a bound from the read and write latency sources.
     */
    @FunctionalInterface
    interface LatencySelector
    {
        /**
         * @param readLatencyHistogram  source of read latencies
         * @param writeLatencyHistogram source of write latencies
         * @return the selected latency
         */
        long select(LatencySupplier readLatencyHistogram, LatencySupplier writeLatencyHistogram);
    }

    /**
     * Creates the {@link LatencySelector} variants a bound expression can request.
     */
    interface LatencySelectorFactory
    {
        /** @return a selector that always yields {@code latency}, consulting neither source */
        default LatencySelector constant(long latency)
        {
            return (reads, writes) -> latency;
        }

        /** @return a selector yielding the read latency at {@code percentile} */
        default LatencySelector read(double percentile)
        {
            return (reads, writes) -> reads.get(percentile);
        }

        /** @return a selector yielding the write latency at {@code percentile} */
        default LatencySelector write(double percentile)
        {
            return (reads, writes) -> writes.get(percentile);
        }

        /** @return a selector yielding the larger of the read and write latencies at {@code percentile} */
        default LatencySelector maxReadWrite(double percentile)
        {
            return (reads, writes) -> {
                long readLatency = reads.get(percentile);
                long writeLatency = writes.get(percentile);
                return max(readLatency, writeLatency);
            };
        }
    }

    /**
     * Transforms a selected base latency, optionally as a function of the number of attempts.
     */
    @FunctionalInterface
    interface LatencyModifier
    {
        /**
         * @param latency  the base latency to transform
         * @param attempts the attempt count supplied by the caller
         * @return the transformed latency
         */
        long modify(long latency, int attempts);
    }

    /**
     * Chooses how long to wait, given the range computed for the current attempt.
     */
    @FunctionalInterface
    interface WaitRandomizer
    {
        /**
         * @param min      lower end of the range
         * @param max      upper end of the range
         * @param attempts the attempt count supplied by the caller
         * @return the chosen wait
         */
        long wait(long min, long max, int attempts);
    }

    /**
     * Creates the {@link WaitRandomizer} implementations. The sources of randomness are obtained through
     * {@link #uniformLongSupplier()} and {@link #uniformDoubleSupplier()}, so overriding those two methods
     * replaces the randomness used by every randomizer this factory builds.
     */
    static interface WaitRandomizerFactory
    {
        default LongBinaryOperator uniformLongSupplier() { return (min, max) -> ThreadLocalRandom.current().nextLong(min, max); } // DO NOT USE METHOD HANDLES (want to fetch afresh each time)
        default DoubleSupplier uniformDoubleSupplier() { return () -> ThreadLocalRandom.current().nextDouble(); }

        default WaitRandomizer uniform() { return new Uniform(uniformLongSupplier()); }
        default WaitRandomizer exponential(double power) { return new Exponential(uniformLongSupplier(), uniformDoubleSupplier(), power); }
        default WaitRandomizer quantizedExponential(double power) { return new QuantizedExponential(uniformLongSupplier(), uniformDoubleSupplier(), power); }

        /** Delegates every draw directly to the supplied uniform source over {@code (min, max)}. */
        static class Uniform implements WaitRandomizer
        {
            final LongBinaryOperator uniformLong;

            public Uniform(LongBinaryOperator uniformLong)
            {
                this.uniformLong = uniformLong;
            }

            @Override
            public long wait(long min, long max, int attempts)
            {
                return uniformLong.applyAsLong(min, max);
            }
        }

        /** Shared state of the exponential randomizers: both randomness sources and the power to apply. */
        static abstract class AbstractExponential implements WaitRandomizer
        {
            final LongBinaryOperator uniformLong;
            final DoubleSupplier uniformDouble;
            final double power;

            public AbstractExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power)
            {
                this.uniformLong = uniformLong;
                this.uniformDouble = uniformDouble;
                this.power = power;
            }
        }

        /**
         * Uniform on the first attempt; afterwards returns {@code max - (max - min) * p^power},
         * where {@code p} is drawn from the double source.
         */
        static class Exponential extends AbstractExponential
        {
            public Exponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power)
            {
                super(uniformLong, uniformDouble, power);
            }

            @Override
            public long wait(long min, long max, int attempts)
            {
                if (attempts == 1)
                    return uniformLong.applyAsLong(min, max);

                double p = uniformDouble.getAsDouble();
                // scale the width of the range in floating point, then truncate back to a long
                long delta = (long) ((max - min) * Math.pow(p, power));
                return max - delta;
            }
        }

        /**
         * Splits the range into {@code attempts} equally sized buckets, picks a bucket index as
         * {@code attempts * p^power}, then draws uniformly within that bucket, counting back from {@code max}.
         */
        static class QuantizedExponential extends AbstractExponential
        {
            public QuantizedExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power)
            {
                super(uniformLong, uniformDouble, power);
            }

            @Override
            public long wait(long min, long max, int attempts)
            {
                // a single bucket is simply a uniform draw; checking attempts before dividing also
                // avoids an ArithmeticException should attempts ever be zero
                if (attempts <= 1)
                    return uniformLong.applyAsLong(min, max);

                long quanta = (max - min) / attempts;
                // range too narrow to give every bucket a non-zero width
                if (quanta == 0)
                    return uniformLong.applyAsLong(min, max);

                double p = uniformDouble.getAsDouble();
                int base = (int) (attempts * Math.pow(p, power));
                // draw within the bucket using the injected source, consistently with the other randomizers
                return max - uniformLong.applyAsLong(quanta * base, quanta * (base + 1));
            }
        }
    }

    /**
     * Immutable pairing of a latency snapshot with the timestamp until which it may be reused.
     */
    static class SnapshotAndTime
    {
        /** Timestamp (same clock as the creator used) after which the snapshot is considered stale. */
        final long validUntil;
        /** The cached snapshot. */
        final Snapshot snapshot;

        SnapshotAndTime(long validUntil, Snapshot snapshot)
        {
            this.snapshot = snapshot;
            this.validUntil = validUntil;
        }
    }

    /**
     * A {@link LatencySupplier} that caches the snapshot produced by {@code snapshotSupplier} and only
     * fetches a new one once the cached snapshot is older than the configured validity period.
     * The cached value is held in this object's own {@link AtomicReference} state.
     */
    static class TimeLimitedLatencySupplier extends AtomicReference<SnapshotAndTime> implements LatencySupplier
    {
        final Supplier<Snapshot> snapshotSupplier;
        final long validForNanos;

        /**
         * @param snapshotSupplier produces a fresh snapshot on demand
         * @param time             how long a fetched snapshot stays valid
         * @param units            the unit of {@code time}
         */
        TimeLimitedLatencySupplier(Supplier<Snapshot> snapshotSupplier, long time, TimeUnit units)
        {
            this.snapshotSupplier = snapshotSupplier;
            this.validForNanos = units.toNanos(time);
        }

        /** Returns the cached snapshot if it is still valid, otherwise fetches and publishes a new one. */
        private Snapshot getSnapshot()
        {
            long now = nanoTime();

            SnapshotAndTime cur = get();
            // nanosecond timestamps are compared by difference so the result stays correct
            // even if the underlying counter wraps around
            if (cur != null && cur.validUntil - now > 0)
                return cur.snapshot;

            Snapshot newSnapshot = snapshotSupplier.get();
            SnapshotAndTime next = new SnapshotAndTime(now + validForNanos, newSnapshot);
            if (compareAndSet(cur, next))
                return next.snapshot;

            // lost the race against a concurrent refresh: keep whichever entry expires later
            return accumulateAndGet(next, (a, b) -> a.validUntil - b.validUntil > 0 ? a : b).snapshot;
        }

        @Override
        public long get(double percentile)
        {
            return (long)getSnapshot().getValue(percentile);
        }
    }

    /**
     * One evaluated limit of the strategy (min, max or minDelta): a base latency chosen by {@code selector},
     * transformed by {@code modifier}, then clamped into {@code [min, max]}.
     */
    static class Bound
    {
        final long min, max, onFailure;
        final LatencyModifier modifier;
        final LatencySelector selector;
        final LatencySupplier reads, writes;

        /**
         * @param min       lowest value {@link #get} may return on success
         * @param max       highest value {@link #get} may return on success
         * @param onFailure value returned by {@link #get} when evaluation throws
         * @param modifier  transformation applied to the selected latency
         * @param selector  chooses the base latency from the read/write latency sources
         * @throws IllegalArgumentException if {@code min > max}
         */
        Bound(long min, long max, long onFailure, LatencyModifier modifier, LatencySelector selector)
        {
            Preconditions.checkArgument(min<=max, "min (%s) must be less than or equal to max (%s)", min, max);
            this.min = min;
            this.max = max;
            this.onFailure = onFailure;
            this.modifier = modifier;
            this.selector = selector;
            // CAS read/write latency snapshots, each reused for up to 10 seconds
            this.reads = new TimeLimitedLatencySupplier(casReadMetrics.latency::getSnapshot, 10L, SECONDS);
            this.writes = new TimeLimitedLatencySupplier(casWriteMetrics.latency::getSnapshot, 10L, SECONDS);
        }

        /**
         * Evaluates this bound for the given attempt count.
         *
         * @return the modified latency clamped into {@code [min, max]}, or {@code onFailure} if evaluation throws
         */
        long get(int attempts)
        {
            try
            {
                long base = selector.select(reads, writes);
                return max(min, min(max, modifier.modify(base, attempts)));
            }
            catch (Throwable t)
            {
                // best effort: never fail the operation because a bound could not be computed,
                // but say what went wrong (rate limited to one report per minute)
                NoSpamLogger.getLogger(logger, 1L, MINUTES).info("Failed to compute paxos contention bound; using fallback value", t);
                return onFailure;
            }
        }

        @Override
        public String toString()
        {
            return "Bound{" +
                   "min=" + min +
                   ", max=" + max +
                   ", onFailure=" + onFailure +
                   ", modifier=" + modifier +
                   ", selector=" + selector +
                   '}';
        }
    }

    /** Draws the actual wait from the range computed for an attempt. */
    final WaitRandomizer waitRandomizer;
    /** Lower end of the wait range. */
    final Bound min;
    /** Upper end of the wait range. */
    final Bound max;
    /** Minimum width the wait range should have. */
    final Bound minDelta;
    /** Attempt count from which a trace session is started for the operation. */
    final int traceAfterAttempts;

    /**
     * Builds a strategy from its textual specification.
     *
     * @param waitRandomizer     randomizer specification, e.g. {@code uniform} or {@code qexp(1.5)}
     * @param min                bound specification for the minimum wait
     * @param max                bound specification for the maximum wait
     * @param minDelta           bound specification for the minimum gap between min and max
     * @param traceAfterAttempts attempt count from which contended operations are traced
     * @throws IllegalArgumentException if any specification cannot be parsed
     */
    public ContentionStrategy(String waitRandomizer, String min, String max, String minDelta, int traceAfterAttempts)
    {
        // parse everything first, in argument order, so the first invalid argument is the one reported
        final WaitRandomizer parsedRandomizer = parseWaitRandomizer(waitRandomizer);
        final Bound parsedMin = parseBound(min, true);
        final Bound parsedMax = parseBound(max, false);
        final Bound parsedMinDelta = parseBound(minDelta, true);

        this.waitRandomizer = parsedRandomizer;
        this.min = parsedMin;
        this.max = parsedMax;
        this.minDelta = parsedMinDelta;
        this.traceAfterAttempts = traceAfterAttempts;
    }

    /**
     * The kind of paxos operation that is experiencing contention. Each constant carries the title used
     * when a trace session is started for it, and a lower-case form of its name used in trace
     * parameters and log messages.
     */
    public enum Type
    {
        READ("Contended Paxos Read"), WRITE("Contended Paxos Write"), REPAIR("Contended Paxos Repair");

        final String traceTitle;
        final String lowercase;

        Type(String traceTitle)
        {
            this.traceTitle = traceTitle;
            // Locale.ROOT keeps the result independent of the JVM's default locale; with a Turkish
            // default locale the letter 'I' would otherwise lower-case to a dotless 'ı'
            this.lowercase = name().toLowerCase(Locale.ROOT);
        }
    }

    /**
     * Computes the point in time (on the {@code nanoTime()} clock) until which a contended operation should
     * back off before its next attempt, starting a trace session first if the attempt count has reached
     * {@link #traceAfterAttempts} and the operation is not already being traced.
     *
     * @param attempts     the attempt count used to evaluate the bounds and the randomizer
     * @param table        the table being operated on (reported in the trace and log message)
     * @param partitionKey the partition being operated on (reported in the trace and log message)
     * @param consistency  the consistency level of the operation (reported in the trace)
     * @param type         the kind of operation
     * @return {@code nanoTime()} plus the chosen wait
     */
    long computeWaitUntilForContention(int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type)
    {
        if (attempts >= traceAfterAttempts && !Tracing.isTracing())
        {
            Tracing.instance.newSession(Tracing.TraceType.QUERY);
            Tracing.instance.begin(type.traceTitle,
                                   ImmutableMap.of(
                                       "keyspace", table.keyspace,
                                       "table", table.name,
                                       "partitionKey", table.partitionKeyType.getString(partitionKey.getKey()),
                                       "consistency", consistency.name(),
                                       "kind", type.lowercase
                                   ));

            logger.info("Tracing contended paxos {} for key {} on {}.{} with trace id {}",
                        type.lowercase,
                        ByteBufferUtil.bytesToHex(partitionKey.getKey()),
                        table.keyspace, table.name,
                        Tracing.instance.getSessionId());
        }

        long minWaitMicros = min.get(attempts);
        long maxWaitMicros = max.get(attempts);
        long minDeltaMicros = minDelta.get(attempts);

        // enforce the minimum gap: first by raising max (up to its configured ceiling),
        // then by lowering min (down to its configured floor)
        if (minWaitMicros + minDeltaMicros > maxWaitMicros)
        {
            maxWaitMicros = minWaitMicros + minDeltaMicros;
            if (maxWaitMicros > this.max.max)
            {
                maxWaitMicros = this.max.max;
                minWaitMicros = max(this.min.min, min(this.min.max, maxWaitMicros - minDeltaMicros));
            }
        }

        // The configured limits can leave an empty range (e.g. a constant min equal to max with no delta).
        // The default randomizers draw with ThreadLocalRandom.nextLong(origin, bound), which rejects
        // origin >= bound, so in that case wait for the (capped) maximum instead of failing the operation.
        long wait = minWaitMicros < maxWaitMicros
                    ? waitRandomizer.wait(minWaitMicros, maxWaitMicros, attempts)
                    : maxWaitMicros;
        return nanoTime() + MICROSECONDS.toNanos(wait);
    }

    /**
     * Backs off for the period chosen by this strategy, unless doing so would reach the deadline.
     *
     * @param deadline the point in time the back-off must finish before
     * @return {@code true} if the wait completed; {@code false} if the wait would not finish before the
     *         deadline (in which case no waiting happens) or if the thread was interrupted while waiting
     */
    boolean doWaitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type)
    {
        final long wakeAt = computeWaitUntilForContention(attempts, table, partitionKey, consistency, type);
        if (wakeAt < deadline)
        {
            try
            {
                waitUntil(wakeAt);
                return true;
            }
            catch (InterruptedException e)
            {
                // restore the interrupt flag for callers further up the stack
                Thread.currentThread().interrupt();
            }
        }
        return false;
    }

    /**
     * Backs off using the currently installed strategy.
     *
     * @return the result of {@link #doWaitForContention} on the current strategy
     */
    static boolean waitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type)
    {
        final ContentionStrategy strategy = current;
        return strategy.doWaitForContention(deadline, attempts, table, partitionKey, consistency, type);
    }

    /**
     * Computes, using the currently installed strategy, the time until which the caller should back off.
     *
     * @return the result of {@link #computeWaitUntilForContention} on the current strategy
     */
    static long waitUntilForContention(int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type)
    {
        final ContentionStrategy strategy = current;
        return strategy.computeWaitUntilForContention(attempts, table, partitionKey, consistency, type);
    }

    /**
     * The outcome of parsing a strategy specification: the textual settings that were used
     * (after substituting defaults) together with the strategy built from them.
     */
    static class ParsedStrategy
    {
        final String waitRandomizer;
        final String min;
        final String max;
        final String minDelta;
        final ContentionStrategy strategy;

        ParsedStrategy(String waitRandomizer, String min, String max, String minDelta, ContentionStrategy strategy)
        {
            this.strategy = strategy;
            this.waitRandomizer = waitRandomizer;
            this.min = min;
            this.max = max;
            this.minDelta = minDelta;
        }
    }

    /**
     * Parses a comma separated list of {@code key=value} settings into a strategy. Recognised keys are
     * {@code random}, {@code min}, {@code max}, {@code delta} and {@code trace}. Any key that is absent
     * falls back to the corresponding default, or for {@code trace} to the current strategy's value.
     *
     * @param spec the specification to parse
     * @return the effective textual settings and the strategy built from them
     * @throws IllegalArgumentException if a provided value cannot be parsed
     */
    @VisibleForTesting
    static ParsedStrategy parseStrategy(String spec)
    {
        final String[] params = spec.split(",");

        final String randomArg = find(params, "random");
        final String minArg = find(params, "min");
        final String maxArg = find(params, "max");
        final String deltaArg = find(params, "delta");
        final String traceArg = find(params, "trace");

        final String waitRandomizer = randomArg != null ? randomArg : defaultWaitRandomizer();
        final String min = minArg != null ? minArg : defaultMinWait();
        final String max = maxArg != null ? maxArg : defaultMaxWait();
        final String minDelta = deltaArg != null ? deltaArg : defaultMinDelta();

        final int traceAfterAttempts;
        if (traceArg != null)
            traceAfterAttempts = Integer.parseInt(traceArg);
        else
            traceAfterAttempts = current.traceAfterAttempts;

        return new ParsedStrategy(waitRandomizer, min, max, minDelta,
                                  new ContentionStrategy(waitRandomizer, min, max, minDelta, traceAfterAttempts));
    }


    /**
     * Parses {@code spec}, installs the resulting strategy as the current one, and records the
     * effective textual settings through the configuration setters.
     *
     * @param spec the specification to parse
     * @throws IllegalArgumentException if the specification cannot be parsed (nothing is changed in that case)
     */
    public static void setStrategy(String spec)
    {
        final ParsedStrategy update = parseStrategy(spec);

        // install the new strategy first, then record the settings it was built from
        current = update.strategy;
        setPaxosContentionWaitRandomizer(update.waitRandomizer);
        setPaxosContentionMinWait(update.min);
        setPaxosContentionMaxWait(update.max);
        setPaxosContentionMinDelta(update.minDelta);
    }

    /**
     * @return the current settings rendered as a specification string of the form
     *         {@code min=...,max=...,delta=...,random=...,trace=...}
     */
    public static String getStrategySpec()
    {
        final String minSpec = defaultMinWait();
        final String maxSpec = defaultMaxWait();
        final String deltaSpec = defaultMinDelta();
        final String randomSpec = defaultWaitRandomizer();
        final int trace = current.traceAfterAttempts;

        return "min=" + minSpec + ",max=" + maxSpec + ",delta=" + deltaSpec + ",random=" + randomSpec + ",trace=" + trace;
    }

    /**
     * Looks up the value of the first {@code param=value} entry in {@code args}.
     *
     * @param args  the entries to search, in order
     * @param param the key to look for
     * @return the text following {@code param=} in the first matching entry, or {@code null} if there is none
     */
    private static String find(String[] args, String param)
    {
        final String prefix = param + '=';
        for (String arg : args)
        {
            if (arg.startsWith(prefix))
                return arg.substring(prefix.length());
        }
        return null;
    }

    /**
     * Builds the selector described by a successful {@code BOUND} match: a percentile of read, write or
     * both latencies when the {@code perc} group is present, otherwise the constant in {@code constbase}.
     */
    private static LatencySelector parseLatencySelector(Matcher m, LatencySelectorFactory selectors)
    {
        final String percentileDigits = m.group("perc");
        if (percentileDigits != null)
        {
            // the digits are read as a decimal fraction, e.g. "50" -> 0.50 and "999" -> 0.999
            final double percentile = parseDouble("0." + percentileDigits);
            final String source = m.group("rw");

            if (source.length() == 2)
                return selectors.maxReadWrite(percentile);

            return "r".equals(source) ? selectors.read(percentile)
                                      : selectors.write(percentile);
        }

        return selectors.constant(parseInMicros(m.group("constbase")));
    }

    /**
     * Builds the modifier described by a successful {@code BOUND} match. With no {@code mod} group the
     * latency is used as is; otherwise it is multiplied by the constant, optionally scaled linearly
     * ({@code *attempts}) or exponentially ({@code ^attempts}) by the attempt count.
     *
     * @throws IllegalArgumentException if the attempt modifier starts with neither {@code *} nor {@code ^}
     */
    private static LatencyModifier parseLatencyModifier(Matcher m, LatencyModifierFactory modifiers)
    {
        final String factorText = m.group("mod");
        if (factorText == null)
            return modifiers.identity();

        final double factor = parseDouble(factorText);
        final String attemptsKind = m.group("modkind");

        if (attemptsKind == null)
            return modifiers.multiply(factor);
        if (attemptsKind.startsWith("*"))
            return modifiers.multiplyByAttempts(factor);
        if (attemptsKind.startsWith("^"))
            return modifiers.multiplyByAttemptsExp(factor);

        throw new IllegalArgumentException("Unrecognised attempt modifier: " + attemptsKind);
    }

    /**
     * Converts a double to a long, returning {@link Long#MAX_VALUE} for any value above the long range
     * rather than relying on the caller to check; every other value is converted with a plain cast.
     *
     * @param v the value to convert
     * @return {@code Long.MAX_VALUE} if {@code v} exceeds it, otherwise {@code (long) v}
     */
    static long saturatedCast(double v)
    {
        return v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
    }

    /**
     * Parses a randomizer specification using the default {@link WaitRandomizerFactory}.
     *
     * @throws IllegalArgumentException if {@code input} is not a valid randomizer specification
     */
    static WaitRandomizer parseWaitRandomizer(String input)
    {
        return parseWaitRandomizer(input, randomizers);
    }

    /**
     * Parses a randomizer specification: {@code uniform}, an exponential form with a power, or a
     * quantized exponential form with a power.
     *
     * @param input       the specification to parse
     * @param randomizers the factory used to build the randomizer
     * @return the randomizer described by {@code input}
     * @throws IllegalArgumentException if {@code input} is not a valid randomizer specification
     */
    static WaitRandomizer parseWaitRandomizer(String input, WaitRandomizerFactory randomizers)
    {
        Matcher m = RANDOMIZER.matcher(input);
        if (!m.matches())
            throw new IllegalArgumentException(input + " does not match " + RANDOMIZER);

        String exp;
        exp = m.group("exp");
        if (exp != null)
            return randomizers.exponential(Double.parseDouble(exp));
        exp = m.group("qexp");
        if (exp != null)
            return randomizers.quantizedExponential(Double.parseDouble(exp));
        // neither power group matched, so the input must have been "uniform"
        return randomizers.uniform();
    }

    /**
     * Parses a bound specification using the default selector and modifier factories.
     *
     * @throws IllegalArgumentException if {@code input} is not a valid bound specification
     */
    static Bound parseBound(String input, boolean isMin)
    {
        return parseBound(input, isMin, selectors, modifiers);
    }

    /**
     * Parses a bound specification.
     *
     * @param input     the specification to parse
     * @param isMin     whether the bound should fall back to its lower limit ({@code true}) or its upper
     *                  limit ({@code false}) when it cannot be evaluated
     * @param selectors factory used to build the latency selector
     * @param modifiers factory used to build the latency modifier
     * @return the parsed bound; when no upper limit is given it defaults to half of {@link #maxQueryTimeoutMicros()}
     * @throws IllegalArgumentException if {@code input} is not a valid bound specification
     */
    @VisibleForTesting
    static Bound parseBound(String input, boolean isMin, LatencySelectorFactory selectors, LatencyModifierFactory modifiers)
    {
        final Matcher m = BOUND.matcher(input);
        if (!m.matches())
            throw new IllegalArgumentException(input + " does not match " + BOUND);

        final String constant = m.group("const");
        if (constant == null)
        {
            final long lower = parseInMicros(m.group("min"), 0);
            final long upper = parseInMicros(m.group("max"), maxQueryTimeoutMicros() / 2);
            final long fallback = isMin ? lower : upper;
            final LatencyModifier modifier = parseLatencyModifier(m, modifiers);
            final LatencySelector selector = parseLatencySelector(m, selectors);
            return new Bound(lower, upper, fallback, modifier, selector);
        }

        // a bare constant: limits, fallback and selected value are all the same number
        final long fixed = parseInMicros(constant);
        return new Bound(fixed, fixed, fixed, modifiers.identity(), selectors.constant(fixed));
    }

    /**
     * Parses a time literal into microseconds, substituting {@code orElse} when no literal was given.
     *
     * @param input  the literal to parse, or {@code null}
     * @param orElse the value to return when {@code input} is {@code null}
     * @throws IllegalArgumentException if {@code input} is non-null and not a valid time literal
     */
    private static long parseInMicros(String input, long orElse)
    {
        if (input == null)
            return orElse;

        return parseInMicros(input);
    }

    /**
     * Parses a time literal ({@code 0}, {@code <n>ms} or {@code <n>us}) into microseconds.
     *
     * @param input the literal to parse
     * @return the number of microseconds it denotes
     * @throws IllegalArgumentException if {@code input} is not a valid time literal
     */
    private static long parseInMicros(String input)
    {
        Matcher m = TIME.matcher(input);
        if (!m.matches())
            throw new IllegalArgumentException(input + " does not match " + TIME);

        String text;
        if (null != (text = m.group(1)))
            // multiply as a long: int arithmetic would silently overflow for values above 2147483ms
            return parseInt(text) * 1000L;
        else if (null != (text = m.group(2)))
            return parseInt(text);
        else
            return 0;
    }

    /** @return the configured wait randomizer specification, or the built-in default if none is configured */
    @VisibleForTesting
    static String defaultWaitRandomizer()
    {
        final Supplier<String> configured = DatabaseDescriptor::getPaxosContentionWaitRandomizer;
        return orElse(configured, DEFAULT_WAIT_RANDOMIZER);
    }

    /** @return the configured minimum wait specification, or the built-in default if none is configured */
    @VisibleForTesting
    static String defaultMinWait()
    {
        final Supplier<String> configured = DatabaseDescriptor::getPaxosContentionMinWait;
        return orElse(configured, DEFAULT_MIN);
    }

    /** @return the configured maximum wait specification, or the built-in default if none is configured */
    @VisibleForTesting
    static String defaultMaxWait()
    {
        final Supplier<String> configured = DatabaseDescriptor::getPaxosContentionMaxWait;
        return orElse(configured, DEFAULT_MAX);
    }

    /** @return the configured minimum delta specification, or the built-in default if none is configured */
    @VisibleForTesting
    static String defaultMinDelta()
    {
        final Supplier<String> configured = DatabaseDescriptor::getPaxosContentionMinDelta;
        return orElse(configured, DEFAULT_MIN_DELTA);
    }

    /** @return the largest of the CAS contention, write RPC and read RPC timeouts, in microseconds */
    @VisibleForTesting
    static long maxQueryTimeoutMicros()
    {
        final long casContention = getCasContentionTimeout(MICROSECONDS);
        final long writeRpc = getWriteRpcTimeout(MICROSECONDS);
        final long readRpc = getReadRpcTimeout(MICROSECONDS);
        return max(max(casContention, writeRpc), readRpc);
    }

    /**
     * @param get    supplies the preferred value, which may be {@code null}
     * @param orElse the value to use when the supplier yields {@code null}
     * @return the supplied value if it is non-null, otherwise {@code orElse}
     */
    private static String orElse(Supplier<String> get, String orElse)
    {
        final String supplied = get.get();
        if (supplied == null)
            return orElse;
        return supplied;
    }
}