diff --git a/.gitignore b/.gitignore index ed55c2091..74b4f37c6 100644 --- a/.gitignore +++ b/.gitignore @@ -197,7 +197,9 @@ nativebin/ # JetBrains cs/.idea/ -cs/remote/.idea -cs/libdpr/.idea +cs/remote/.idea/ +cs/research/libdpr/.idea/ +cs/libdpr/.idea/ +cs/research/darq/.idea/ cs/**/BenchmarkDotNet.Artifacts/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..4ac67238c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build-env +WORKDIR /app + +# Build Travel Reservation +COPY ./cs . +WORKDIR /app/research/darq/TravelReservation +RUN dotnet restore +RUN dotnet build -c Release -o out + +# Build EventProcessing +WORKDIR /app/research/darq/EventProcessing +RUN dotnet restore +RUN dotnet publish -c Release -o out + +# Build CoordinatorMicrobench +WORKDIR /app/research/darq/CoordinatorMicrobench +RUN dotnet restore +RUN dotnet publish -c Release -o out + +FROM mcr.microsoft.com/dotnet/aspnet:7.0 +WORKDIR /app +COPY --from=build-env /app/TravelReservation-latency ./TravelReservation-latency +COPY --from=build-env /app/TravelReservation-thr ./TravelReservation-thr +COPY --from=build-env /app/EventProcessing ./EventProcessing-latency +COPY --from=build-env /app/research/darq/TravelReservation/out ./TravelReservation +COPY --from=build-env /app/research/darq/EventProcessing/out ./EventProcessing +COPY --from=build-env /app/research/darq/CoordinatorMicrobench/out ./CoordinatorMicrobench +EXPOSE 4022 \ No newline at end of file diff --git a/cs/playground/FasterLogMLSDTest/Program.cs b/cs/playground/FasterLogMLSDTest/Program.cs index 474c728a1..847c802f8 100644 --- a/cs/playground/FasterLogMLSDTest/Program.cs +++ b/cs/playground/FasterLogMLSDTest/Program.cs @@ -4,7 +4,9 @@ using System; using System.Diagnostics; using System.IO; +using System.Linq; using System.Threading; +using System.Threading.Tasks; using FASTER.core; namespace FasterLogStress @@ -16,27 +18,55 @@ public class Program static readonly byte[] 
entry = new byte[100]; private static string commitPath; - public static void Main() + private static byte[] buffer; + + public static async Task Main() { - commitPath = "FasterLogStress/"; + using var settings = new FasterLogSettings("./Test", deleteDirOnDispose: true) + { + AutoRefreshSafeTailAddress = true + }; + log = new FasterLog(settings); - // Clean up log files from previous test runs in case they weren't cleaned up - // We loop to ensure clean-up as deleteOnClose does not always work for MLSD - while (Directory.Exists(commitPath)) - Directory.Delete(commitPath, true); + buffer = new byte[2048]; + Random.Shared.NextBytes(buffer); - // Create devices \ log for test - device = new ManagedLocalStorageDevice(commitPath + "ManagedLocalStore.log", deleteOnClose: true); - log = new FasterLog(new FasterLogSettings { LogDevice = device, PageSizeBits = 12, MemorySizeBits = 14 }); + await Task.WhenAll(EnqueueThread(), ScanThread()); + log.Dispose(); + } + + static async Task EnqueueThread() + { + for (int count = 0; count < 5; ++count) + { + await log.EnqueueAsync(buffer); + await Task.Delay(1000); + } - ManagedLocalStoreBasicTest(); + log.CompleteLog(); + Console.WriteLine("Enqueue complete"); + } - log.Dispose(); - device.Dispose(); + static async Task ScanThread() + { + using var iterator = log.Scan(log.BeginAddress, long.MaxValue, scanUncommitted: true); + while (true) + { + byte[] result; + while (!iterator.GetNext(out result, out _, out _)) + { + if (iterator.Ended) + { + Console.WriteLine("Scan complete"); + return; + } - // Clean up log files - if (Directory.Exists(commitPath)) - Directory.Delete(commitPath, true); + await iterator.WaitAsync(); + } + + Console.WriteLine("Received buffer"); + Debug.Assert(result.SequenceEqual(buffer)); + } } diff --git a/cs/remote/src/FASTER.common/ElasticCircularBuffer.cs b/cs/remote/src/FASTER.common/ElasticCircularBuffer.cs index 85762e5d7..cdc63c720 100644 --- a/cs/remote/src/FASTER.common/ElasticCircularBuffer.cs +++ 
b/cs/remote/src/FASTER.common/ElasticCircularBuffer.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Runtime.CompilerServices; using System.Runtime.Serialization; +using System.Threading; namespace FASTER.common { @@ -92,7 +93,8 @@ public sealed class ElasticCircularBuffer : IEnumerable private readonly LinkedList> buffers; private LinkedListNode> head; private LinkedListNode> tail; - + private int count; + /// /// Constructor /// @@ -103,6 +105,9 @@ public ElasticCircularBuffer() buffers.AddFirst(node); tail = head = node; } + + public int ApproxCount => count; + /// /// Enqueue @@ -111,6 +116,8 @@ public ElasticCircularBuffer() [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Enqueue(ref T value) { + Interlocked.Increment(ref count); + if (tail.Value.IsFull()) { tail.Value.Sealed = true; @@ -158,6 +165,7 @@ public T Dequeue() if (head == null) head = buffers.First; temp.Value.Sealed = false; } + Interlocked.Decrement(ref count); return head.Value.Dequeue(); } diff --git a/cs/remote/src/FASTER.common/HeaderReaderWriter.cs b/cs/remote/src/FASTER.common/HeaderReaderWriter.cs index 9a4f1625a..805f4f018 100644 --- a/cs/remote/src/FASTER.common/HeaderReaderWriter.cs +++ b/cs/remote/src/FASTER.common/HeaderReaderWriter.cs @@ -34,7 +34,22 @@ public unsafe int ReadPendingSeqNo(ref byte* dst) public unsafe bool Write(MessageType s, ref byte* dst, int length) { if (length < 1) return false; - *dst++ = (byte)s; + *dst++ = (byte) s; + return true; + } + + /// + /// Write message type to memory + /// + /// Message type + /// Destination memory + /// Length of destination + /// Whether write succeeded + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe bool Write(byte s, ref byte* dst, int length) + { + if (length < 1) return false; + *dst++ = s; return true; } diff --git a/cs/remote/src/FASTER.common/INetworkSender.cs b/cs/remote/src/FASTER.common/INetworkSender.cs index 2b0ddeac0..903b0a822 100644 --- 
a/cs/remote/src/FASTER.common/INetworkSender.cs +++ b/cs/remote/src/FASTER.common/INetworkSender.cs @@ -54,12 +54,7 @@ public interface INetworkSender : IDisposable /// /// Send response (caller owns buffer space) /// - void SendResponse(byte[] buffer, int offset, int count, object context); - - /// - /// Send response (caller owns buffer space) - /// - void SendCallback(object context); + void SendResponse(byte[] buffer, int offset, int count, Action sendCallback); /// /// Dispose, optionally waiting for ongoing outgoing calls to complete diff --git a/cs/remote/src/FASTER.common/NetworkSenderBase.cs b/cs/remote/src/FASTER.common/NetworkSenderBase.cs index 4b2d6aef1..dff3acd31 100644 --- a/cs/remote/src/FASTER.common/NetworkSenderBase.cs +++ b/cs/remote/src/FASTER.common/NetworkSenderBase.cs @@ -1,4 +1,6 @@ -namespace FASTER.common +using System; + +namespace FASTER.common { /// /// NetworkSenderBase class @@ -57,12 +59,12 @@ public NetworkSenderBase(int serverBufferSize) /// public abstract bool SendResponse(int offset, int size); - + /// public abstract void SendResponse(byte[] buffer, int offset, int count, object context); /// - public abstract void SendCallback(object context); + public abstract void SendResponse(byte[] buffer, int offset, int count, Action sendCallback); /// public abstract void Dispose(); diff --git a/cs/remote/src/FASTER.common/SimpleObjectPool.cs b/cs/remote/src/FASTER.common/SimpleObjectPool.cs index 9d8bf94ee..e789ae13a 100644 --- a/cs/remote/src/FASTER.common/SimpleObjectPool.cs +++ b/cs/remote/src/FASTER.common/SimpleObjectPool.cs @@ -2,7 +2,6 @@ // Licensed under the MIT license. 
using System; -using System.Diagnostics; using System.Runtime.CompilerServices; using System.Threading; @@ -12,20 +11,29 @@ namespace FASTER.common /// Object pool /// /// - public class SimpleObjectPool : IDisposable where T : class, IDisposable + public class SimpleObjectPool : IDisposable where T : class { private readonly Func factory; + private readonly Action destructor; private readonly LightConcurrentStack stack; private int allocatedObjects; /// /// Constructor /// - /// - /// - public SimpleObjectPool(Func factory, int maxObjects = 128) + /// method used to create new objects of type T + /// + /// Max number of objects that will be retained and recycled in this object pool. + /// Objects exceeding this count are created and destroyed on demand + /// + /// method used to dispose retained objects when they go out of scope. + /// WARNING: NOT invoked on retained objects before reuse + /// + + public SimpleObjectPool(Func factory, int maxObjects = 128, Action destructor = null) { this.factory = factory; + this.destructor = destructor; stack = new LightConcurrentStack(maxObjects); allocatedObjects = 0; } @@ -39,7 +47,7 @@ public void Dispose() { while (stack.TryPop(out var elem)) { - elem.Dispose(); + destructor?.Invoke(elem); Interlocked.Decrement(ref allocatedObjects); } Thread.Yield(); @@ -69,7 +77,7 @@ public void Return(T obj) { if (!stack.TryPush(obj)) { - obj.Dispose(); + destructor?.Invoke(obj); Interlocked.Decrement(ref allocatedObjects); } } diff --git a/cs/remote/src/FASTER.common/TcpNetworkSender.cs b/cs/remote/src/FASTER.common/TcpNetworkSender.cs index 3aefa435d..392c133e6 100644 --- a/cs/remote/src/FASTER.common/TcpNetworkSender.cs +++ b/cs/remote/src/FASTER.common/TcpNetworkSender.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. 
+using System; using System.Net; using System.Net.Sockets; using System.Runtime.CompilerServices; @@ -22,6 +23,9 @@ public class TcpNetworkSender : NetworkSenderBase /// Response object /// protected SeaaBuffer responseObject; + + // For use when user invokes send variant with user-owned buffers + private SimpleObjectPool saeaPool = new(() => new SocketAsyncEventArgs()); /// /// Reusable SeaaBuffer @@ -58,7 +62,8 @@ public TcpNetworkSender( : base(maxSizeSettings) { this.socket = socket; - this.reusableSeaaBuffer = new SimpleObjectPool(() => new SeaaBuffer(SeaaBuffer_Completed, this.serverBufferSize)); + this.reusableSeaaBuffer = new SimpleObjectPool(() => new SeaaBuffer(SeaaBuffer_Completed, + this.serverBufferSize), 128, s => s.Dispose()); this.responseObject = null; this.ThrottleMax = throttleMax; @@ -82,7 +87,8 @@ public TcpNetworkSender( : base(serverBufferSize) { this.socket = socket; - this.reusableSeaaBuffer = new SimpleObjectPool(() => new SeaaBuffer(SeaaBuffer_Completed, this.serverBufferSize)); + this.reusableSeaaBuffer = new SimpleObjectPool(() => new SeaaBuffer(SeaaBuffer_Completed, + this.serverBufferSize), 128, s => s.Dispose()); this.responseObject = null; this.ThrottleMax = throttleMax; @@ -154,15 +160,32 @@ public override bool SendResponse(int offset, int size) } /// - public override void SendResponse(byte[] buffer, int offset, int count, object context) + public override void SendResponse(byte[] buffer, int offset, int count, Action sendCallback) { - throw new System.NotImplementedException(); + var saea = saeaPool.Checkout(); + saea.SetBuffer(buffer, offset, count); + saea.UserToken = sendCallback; + saea.Completed += SaeaBuffer_Completed; + + if (Interlocked.Increment(ref throttleCount) > ThrottleMax) + throttle.Wait(); + if (!socket.SendAsync(saea)) + SaeaBuffer_Completed(null, saea); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void SaeaBuffer_Completed(object sender, SocketAsyncEventArgs e) + { + 
((Action)e.UserToken)(); + saeaPool.Return(e); + if (Interlocked.Decrement(ref throttleCount) >= ThrottleMax) + throttle.Release(); } /// - public override void SendCallback(object context) + public override void SendResponse(byte[] buffer, int offset, int count, object context) { - throw new System.NotImplementedException(); + throw new NotImplementedException(); } /// diff --git a/cs/remote/src/FASTER.common/ThreadLocalObjectPool.cs b/cs/remote/src/FASTER.common/ThreadLocalObjectPool.cs new file mode 100644 index 000000000..490468be6 --- /dev/null +++ b/cs/remote/src/FASTER.common/ThreadLocalObjectPool.cs @@ -0,0 +1,63 @@ +using System; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.common +{ + + /// + /// An object pool that optimizes for use cases where objects are reused on the same thread. Thread-safe. + /// There will be memory utilization problems if checkout and return pairs are not called on the same thread. + /// + /// + public class ThreadLocalObjectPool : IDisposable where T : class + { + // TODO(Tianyu): Replace with FastThreadLocal if performance is an issue + private readonly ThreadLocal> objects; + + /// + /// Constructs a new object pool + /// + /// method used to create new objects of type T + /// + /// Max number of objects that will be retained and recycled in this object pool per thread. + /// Objects exceeding this count are created and destroyed on demand + /// + /// method used to dispose retained objects when they go out of scope. 
WARNING: NOT invoked on retained objects before reuse + public ThreadLocalObjectPool(Func factory, int maxObjectPerThread = 128, Action destructor = null) + { + objects = new ThreadLocal>( + () => new SimpleObjectPool(factory, maxObjectPerThread, destructor), true); + } + + /// + /// Dispose + /// + public void Dispose() + { + foreach (var pool in objects.Values) + pool.Dispose(); + } + + + /// + /// Gets a new (reused) object + /// + /// object of type T + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public T Checkout() + { + return objects.Value.Checkout(); + } + + /// + /// Returns a used object for future use + /// + /// object to return + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Return(T obj) + { + objects.Value.Return(obj); + } + } +} \ No newline at end of file diff --git a/cs/remote/src/FASTER.common/WireFormat.cs b/cs/remote/src/FASTER.common/WireFormat.cs index 593730981..e863b7082 100644 --- a/cs/remote/src/FASTER.common/WireFormat.cs +++ b/cs/remote/src/FASTER.common/WireFormat.cs @@ -23,7 +23,6 @@ public enum WireFormat : byte /// Similar to DefaultVarLenKV but with WebSocket headers (binary) /// WebSocket = 2, - /// /// ASCII wire format (non-binary protocol) /// diff --git a/cs/remote/src/FASTER.server/ISessionProvider.cs b/cs/remote/src/FASTER.server/ISessionProvider.cs index 5f6941c5f..897ef3b7d 100644 --- a/cs/remote/src/FASTER.server/ISessionProvider.cs +++ b/cs/remote/src/FASTER.server/ISessionProvider.cs @@ -14,7 +14,7 @@ public interface ISessionProvider /// Given messages of wire format type and a networkSender, returns a session that handles that wire format. If no provider is configured /// for the given wire format, an exception is thrown. 
/// - /// Wire format + /// Wire format byte /// Socket connection /// Server session IMessageConsumer GetSession(WireFormat wireFormat, INetworkSender networkSender); diff --git a/cs/research/darq/CoordinatorMicrobench/CoordinatorMicrobench.csproj b/cs/research/darq/CoordinatorMicrobench/CoordinatorMicrobench.csproj new file mode 100644 index 000000000..0b831b77f --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/CoordinatorMicrobench.csproj @@ -0,0 +1,36 @@ + + + + Exe + net7.0 + enable + enable + + + + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + Server,Client + Public + True + True + obj\Release\net7.0\ + MSBuild:Compile + + + + diff --git a/cs/research/darq/CoordinatorMicrobench/Program.cs b/cs/research/darq/CoordinatorMicrobench/Program.cs new file mode 100644 index 000000000..188e237ae --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/Program.cs @@ -0,0 +1,188 @@ + +using System.Diagnostics; +using System.Net; +using Azure.Storage.Blobs; +using CommandLine; +using FASTER.core; +using FASTER.libdpr; +using FASTER.libdpr.proto; +using Grpc.Core; +using Grpc.Net.Client; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; + +namespace microbench; + +public class Options +{ + [Option('t', "type", Required = true, + HelpText = "type of worker to launch")] + public string Type { get; set; } + + [Option('o', "output-file", Required = false, + HelpText = "Name of file to output")] + public string OutputFile { get; set; } + + [Option('n', "num-workers", Required = false, Default = 1, + HelpText = "number of workers to simulate in total in a pod")] + public int NumWorkers { get; set; } + + [Option('i', "pod-id", Required = false, + HelpText = "id of the client being launched")] + public int PodId { get; set; } + + [Option('p', "num-pods", 
Required = false, Default = 8, + HelpText = "number of pods")] + public int NumPods { get; set; } + + [Option('d', "dep-prob", Required = false, Default = 0.2, + HelpText = "probability of taking on a dependency")] + public double DependencyProbability { get; set; } + + [Option('c', "checkpoint-interval", Required = false, Default = 10, + HelpText = "checkpoint interval")] + public int CheckpointInterval { get; set; } +} + +public class StatsAggregationServiceImpl : StatsAggregationService.StatsAggregationServiceBase +{ + private int toReport, toSynchronize; + private TaskCompletionSource completionTcs = new(), synchronizationTcs = new(); + private List measurements = new(); + private Action> outputAction; + public StatsAggregationServiceImpl(int numPods, Action> outputAction) + { + toSynchronize = toReport = numPods; + this.outputAction = outputAction; + } + + public override async Task ReportResults(ReportResultsMessage request, ServerCallContext context) + { + lock (this) + { + measurements.AddRange(request.Latencies); + if (--toReport == 0) + { + outputAction(measurements); + completionTcs.SetResult(); + } + } + + await completionTcs.Task; + return new ReportResultsMessage(); + } + + public override async Task Synchronize(SynchronizeRequest request, ServerCallContext context) + { + if (Interlocked.Decrement(ref toSynchronize) == 0) + synchronizationTcs.SetResult(); + await synchronizationTcs.Task; + return new SynchronizeResponse(); + } +} + +public class Program +{ + public static async Task Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + + switch (options.Type.Trim()) + { + case "worker": + await LaunchBenchmarkClient(options); + break; + case "dprfinder": + await LaunchDprFinder(options); + break; + default: + throw new NotImplementedException(); + } + } + + private static Task 
LaunchBenchmarkClient(Options options) + { + var workers = new List(); + for (var i = 0; i < options.NumPods * options.NumWorkers; i++) + workers.Add(new DprWorkerId(i)); + + var toSimulate = new List(); + for (var i = 0; i < options.NumWorkers; i++) + toSimulate.Add(new DprWorkerId(i * options.NumPods + options.PodId)); + + var channel = GrpcChannel.ForAddress("http://dprfinder.dse.svc.cluster.local:15721"); + // var channel = GrpcChannel.ForAddress("http://127.0.0.1:15721"); + + var finder = new GrpcDprFinder(channel); + var worker = new SimulatedDprWorker(finder, new UniformWorkloadGenerator(options.DependencyProbability), workers, toSimulate); + var client = new StatsAggregationService.StatsAggregationServiceClient(channel); + client.Synchronize(new SynchronizeRequest()); + worker.RunContinuously(30, options.CheckpointInterval); + var results = new ReportResultsMessage(); + results.Latencies.AddRange(worker.ComputeVersionCommitLatencies()); + client.ReportResults(results); + + return Task.CompletedTask; + } + + public static async Task LaunchDprFinder(Options options) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, 15721, + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + + using var device1 = new LocalMemoryDevice(1 << 25, 1 << 25, 1); + using var device2 = new LocalMemoryDevice(1 << 25, 1 << 25, 1); + var device = new PingPongDevice(device1, device2); + + builder.Services.AddSingleton(device); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + var aggregation = new StatsAggregationServiceImpl(options.NumPods, measurements => + { + // foreach (var line in measurements) + // Console.WriteLine(line * 1000.0 / Stopwatch.Frequency); + using var 
memoryStream = new MemoryStream(); + using var streamWriter = new StreamWriter(memoryStream); + foreach (var line in measurements) + streamWriter.WriteLine(line * 1000.0 / Stopwatch.Frequency); + streamWriter.Flush(); + memoryStream.Position = 0; + + var connString = Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + blobContainerClient.CreateIfNotExists(); + var blobClient = blobContainerClient.GetBlobClient(options.OutputFile); + blobClient.Upload(memoryStream, overwrite: true); + }); + + builder.Services.AddSingleton(aggregation); + + builder.Services.AddGrpc(); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGrpcService(); + + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } +} \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/SimulatedDprWorker.cs b/cs/research/darq/CoordinatorMicrobench/SimulatedDprWorker.cs new file mode 100644 index 000000000..27b461a06 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/SimulatedDprWorker.cs @@ -0,0 +1,83 @@ +using System.Diagnostics; +using FASTER.common; +using FASTER.libdpr; + +namespace microbench; + +public class SimulatedDprWorker +{ + private IDprFinder dprFinder; + private IWorkloadGenerator generator; + private List workers; + private List toSimulate; + + private SimpleObjectPool> pool = new(() => new List(10)); + + private Dictionary versionPersistent = new(), versionRecoverable = new(); + private Stopwatch stopwatch; + + public SimulatedDprWorker(IDprFinder dprFinder, IWorkloadGenerator generator, + List workers, List toSimulate) + { + foreach (var w in 
toSimulate) + dprFinder.AddWorker(w, Enumerable.Empty>); + this.dprFinder = dprFinder; + this.generator = generator; + this.workers = workers; + this.toSimulate = toSimulate; + } + + public List ComputeVersionCommitLatencies() + { + var result = new List(versionRecoverable.Count); + + foreach (var entry in versionRecoverable) + { + if (!versionPersistent.TryGetValue(entry.Key, out var startTime)) continue; + result.Add(entry.Value - startTime); + } + + return result; + } + + public void RunContinuously(int runSeconds, int checkpointMilli) + { + var currentVersion = 1L; + var safeVersions = new Dictionary(); + foreach (var w in toSimulate) + safeVersions[w] = 0; + stopwatch = Stopwatch.StartNew(); + while (stopwatch.ElapsedMilliseconds < runSeconds * 1000) + { + var elapsed = stopwatch.ElapsedMilliseconds; + var currentTime = stopwatch.ElapsedTicks; + dprFinder.RefreshStateless(); + foreach (var w in toSimulate) + { + var currentSafeVersion = dprFinder.SafeVersion(w); + for (var i = safeVersions[w] + 1; i <= currentSafeVersion; i++) + versionRecoverable.Add(new WorkerVersion(w, i), currentTime); + safeVersions[w] = currentSafeVersion; + } + + var expectedVersion = 1 + elapsed / checkpointMilli; + if (expectedVersion > currentVersion) + { + foreach (var w in toSimulate) + { + var wv = new WorkerVersion(w, currentVersion); + var deps = pool.Checkout(); + generator.GenerateDependenciesOneRun(workers, w, currentVersion, deps); + versionPersistent.Add(wv, currentTime); + Task.Run(() => + { + dprFinder.ReportNewPersistentVersion(1, wv, deps); + pool.Return(deps); + }); + } + + currentVersion = expectedVersion; + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/WorkloadGenerator.cs b/cs/research/darq/CoordinatorMicrobench/WorkloadGenerator.cs new file mode 100644 index 000000000..88e9c3e21 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/WorkloadGenerator.cs @@ -0,0 +1,34 @@ +using FASTER.libdpr; + +namespace 
microbench; + +public interface IWorkloadGenerator +{ + void GenerateDependenciesOneRun(IList workers, DprWorkerId me, long currentVersion, List output); +} + +public class UniformWorkloadGenerator : IWorkloadGenerator +{ + private Random rand = new(); + private double depProb; + + public UniformWorkloadGenerator(double depProb) + { + this.depProb = depProb; + } + + public void GenerateDependenciesOneRun(IList workers, DprWorkerId me, + long currentVersion, List output) + { + output.Clear(); + if (currentVersion != 1) + output.Add(new WorkerVersion(me, currentVersion - 1)); + foreach (var worker in workers) + { + if (worker.Equals(me)) continue; + + if (rand.NextDouble() < depProb) + output.Add(new WorkerVersion(worker, currentVersion)); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/finder.proto b/cs/research/darq/CoordinatorMicrobench/finder.proto new file mode 100644 index 000000000..913efbde4 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/finder.proto @@ -0,0 +1,25 @@ +syntax = "proto3"; + +option csharp_namespace = "FASTER.libdpr.proto"; + +message ReportResultsMessage { + repeated int64 latencies = 1; +} + +message ReportResultsResponse { + +} + +message SynchronizeRequest { + +} + +message SynchronizeResponse { + +} + +service StatsAggregationService { + rpc ReportResults(ReportResultsMessage) returns (ReportResultsMessage); + rpc Synchronize(SynchronizeRequest) returns (SynchronizeResponse); + +} \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/Chart.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/Chart.yaml new file mode 100644 index 000000000..279950e87 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: reservation-workload +description: Helm chart for TravelReservation workload (clients, services, and orchestrator) +type: application +version: 0.1.0 +appVersion: "1.16.0" \ 
No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-deployment.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-deployment.yaml new file mode 100644 index 000000000..118eacc34 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + service: dprfinder + name: dprfinder +spec: + replicas: 1 + selector: + matchLabels: + service: dprfinder + strategy: {} + template: + metadata: + labels: + service: dprfinder + spec: + priorityClassName: high-priority + containers: + - command: + - "CoordinatorMicrobench/CoordinatorMicrobench" + args: + - "-t" + - "dprfinder" + - "-o" + - "coordinator-microbench-result-n{{ .Values.num_workers }}-d{{ .Values.dep_prob}}.csv" + image: tianyuli96/faster:latest + name: dprfinder + ports: + - containerPort: 15721 + - containerPort: 4022 + envFrom: + - configMapRef: + name: env-config + nodeSelector: + nodepool: dsebench + restartPolicy: Always \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-service.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-service.yaml new file mode 100644 index 000000000..5268bde33 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/dprfinder-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + service: dprfinder + name: dprfinder +spec: + ports: + - name: "15721" + port: 15721 + targetPort: 15721 + selector: + service: dprfinder diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/env-config.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/env-config.yaml new file mode 100644 index 000000000..a876ec967 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/env-config.yaml 
@@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: env-config + namespace: dse +data: + AZURE_CONN_STRING: "{{ .Values.conn_string }}" + AZURE_RESULTS_CONN_STRING: "{{ .Values.results_conn_string }}" \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/service-role-auth.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/service-role-auth.yaml new file mode 100644 index 000000000..ff978ccc0 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/service-role-auth.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: deployment-watcher + namespace: dse +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: deployment-reader + namespace: dse +rules: + - apiGroups: [ "apps" ] + resources: [ "deployments" ] + verbs: [ "get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: read-deployments + namespace: dse +subjects: + - kind: ServiceAccount + name: deployment-watcher + namespace: dse +roleRef: + kind: Role + name: deployment-reader + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/worker-jobs.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/worker-jobs.yaml new file mode 100644 index 000000000..733597e7c --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/templates/worker-jobs.yaml @@ -0,0 +1,49 @@ +{{- range .Values.workers }} +apiVersion: batch/v1 +kind: Job +metadata: + name: client{{ .num }} +spec: + template: + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-all + image: bitnami/kubectl + command: [ "sh", "-c" ] + args: + - > + while true; do + # Check if all deployments have at least one ready replica + notReadyCount=$(kubectl get deployments -n dse | 
grep -c "0/1") + if [ "$notReadyCount" -eq 0 ]; then + echo "All deployments are ready."; + break; + else + echo "Waiting for all deployments to be ready. $notReadyCount deployments are not ready."; + sleep 1; + fi; + done; + containers: + - command: + - "CoordinatorMicrobench/CoordinatorMicrobench" + args: + - "-t" + - "worker" + - "-i" + - "{{ .num }}" + - "-n" + - "{{ $.Values.num_workers }}" + - "-d" + - "{{ $.Values.dep_prob}}" + image: tianyuli96/faster:latest + name: "client{{ .num }}" + ports: + - containerPort: 15721 + envFrom: + - configMapRef: + name: env-config + restartPolicy: Never +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/CoordinatorMicrobench/helm-workload/values.yaml b/cs/research/darq/CoordinatorMicrobench/helm-workload/values.yaml new file mode 100644 index 000000000..c398e9068 --- /dev/null +++ b/cs/research/darq/CoordinatorMicrobench/helm-workload/values.yaml @@ -0,0 +1,16 @@ +conn_string: foo +results_conn_string: foo + +num_workers: 1 +dep_prob: 0.2 + +workers: + - num: 0 + - num: 1 + - num: 2 + - num: 3 + - num: 4 + - num: 5 + - num: 6 + - num: 7 + diff --git a/cs/research/darq/DistributedTransactions/DistributedTransactions.csproj b/cs/research/darq/DistributedTransactions/DistributedTransactions.csproj new file mode 100644 index 000000000..4c389644f --- /dev/null +++ b/cs/research/darq/DistributedTransactions/DistributedTransactions.csproj @@ -0,0 +1,71 @@ + + + + Exe + true + net7.0 + enable + enable + SimpleWorkflowBench + true + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + Server, Client + Public + True + True + obj\Debug\net7.0\ + MSBuild:Compile + + + + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + + diff --git a/cs/research/darq/DistributedTransactions/Environment.cs b/cs/research/darq/DistributedTransactions/Environment.cs new file mode 100644 
index 000000000..76003c3e5 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/Environment.cs @@ -0,0 +1,273 @@ +using Azure.Storage.Blobs; +using FASTER.core; +using FASTER.devices; +using FASTER.libdpr; + +namespace DB { + +public interface IEnvironment +{ + public string GetOrchestratorConnString(); + + public int GetOrchestratorPort(Options options); + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options); + + public IDevice GetOrchestratorDevice(Options options); + + public string GetServiceConnString(int index); + + public int GetServicePort(Options options); + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options); + + public IDevice GetServiceDevice(Options options); + + public string GetDprFinderConnString(); + + public int GetDprFinderPort(); + + public PingPongDevice GetDprFinderDevice(); + + public Task PublishResultsAsync(string fileName, MemoryStream bytes); +} + +public class LocalDebugEnvironment : IEnvironment +{ + private int roundRobin; + public string GetOrchestratorConnString() + { + var port = roundRobin++ / 2 == 0 ? 
15724 : 15725; + return $"http://127.0.0.1:{port}"; + } + + public int GetOrchestratorPort(Options options) + { + return options.WorkerName + 15721; + } + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\orchestrators{options.WorkerName}"), removeOutdated: false); + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) => + new ManagedLocalStorageDevice($"D:\\orchestator{options.WorkerName}.log", deleteOnClose: true); + + public string GetServiceConnString(int index) + { + return $"http://127.0.0.1:{15721 + index}"; + } + + public int GetServicePort(Options options) + { + return 15721 + options.WorkerName; + } + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\service{options.WorkerName}"), removeOutdated: false); + result.PurgeAll(); + return result; + } + + public IDevice GetServiceDevice(Options options) => + new ManagedLocalStorageDevice($"D:\\service{options.WorkerName}.log", deleteOnClose: true); + + public string GetDprFinderConnString() => "http://127.0.0.1:15720"; + + public int GetDprFinderPort() => 15720; + + public PingPongDevice GetDprFinderDevice() + { + var device1 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + var device2 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + return new PingPongDevice(device1, device2, true); + } + + public Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + Console.WriteLine($"Results for {fileName}:"); + var reader = new StreamReader(bytes); + var text = reader.ReadToEnd(); + // Print to console + Console.Write(text); + return Task.CompletedTask; + } +} + +public class KubernetesLocalStorageEnvironment : IEnvironment +{ + private 
bool cleanStart; + + public KubernetesLocalStorageEnvironment(bool cleanStart) + { + this.cleanStart = cleanStart; + } + + public string GetOrchestratorConnString() => "http://orchestrator.dse.svc.cluster.local:15721"; + + public int GetOrchestratorPort(Options options) => 15721; + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"/mnt/plrs/orchestrators{options.WorkerName}"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) + { + if (cleanStart) + ManagedLocalStorageDevice.RemoveIfPresent($"/mnt/plrs/orchestrator{options.WorkerName}.log"); + return new ManagedLocalStorageDevice($"/mnt/plrs/orchestrator{options.WorkerName}.log"); + } + + public string GetServiceConnString(int index) => $"http://service{index}.dse.svc.cluster.local:15721"; + + public int GetServicePort(Options options) => 15721; + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"/mnt/plrs/service{options.WorkerName}"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetServiceDevice(Options options) + { + if (cleanStart) + ManagedLocalStorageDevice.RemoveIfPresent($"/mnt/plrs/service{options.WorkerName}.log"); + return new ManagedLocalStorageDevice($"/mnt/plrs/service{options.WorkerName}.log"); + } + + public string GetDprFinderConnString() => "http://dprfinder.dse.svc.cluster.local:15721"; + + public int GetDprFinderPort() => 15721; + + public PingPongDevice GetDprFinderDevice() + { + if (cleanStart) + { + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder1"); + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder2"); 
+ } + + var device1 = new ManagedLocalStorageDevice("/mnt/plrs/finder1", recoverDevice: true); + var device2 = new ManagedLocalStorageDevice("/mnt/plrs/finder2", recoverDevice: true); + return new PingPongDevice(device1, device2, true); + } + + public async Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + var connString = Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + await blobContainerClient.CreateIfNotExistsAsync(); + var blobClient = blobContainerClient.GetBlobClient(fileName); + + await blobClient.UploadAsync(bytes, overwrite: true); + } +} + +public class KubernetesCloudStorageEnvironment : IEnvironment +{ + private bool cleanStart; + + public KubernetesCloudStorageEnvironment(bool cleanStart) + { + this.cleanStart = cleanStart; + } + + public string GetOrchestratorConnString() => "http://orchestrator.dse.svc.cluster.local:15721"; + + public int GetOrchestratorPort(Options options) => 15721; + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new AzureStorageNamedDeviceFactory(Environment.GetEnvironmentVariable("AZURE_CONN_STRING")), + new DefaultCheckpointNamingScheme($"orchestrators/{options.WorkerName}/checkpoints"), + removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) + { + var result = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "orchestrators", + options.WorkerName.ToString(), "darq"); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public string GetServiceConnString(int index) => $"http://service{index}.dse.svc.cluster.local:15721"; + + public int GetServicePort(Options options) => 15721; + + public FileBasedCheckpointManager 
GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new AzureStorageNamedDeviceFactory(Environment.GetEnvironmentVariable("AZURE_CONN_STRING")), + new DefaultCheckpointNamingScheme($"services/{options.WorkerName}/checkpoints"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetServiceDevice(Options options) + { + var result = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "services", + options.WorkerName.ToString(), "log"); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public string GetDprFinderConnString() => "http://dprfinder.dse.svc.cluster.local:15721"; + + public int GetDprFinderPort() => 15721; + + public PingPongDevice GetDprFinderDevice() + { + var device1 = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "dprfinder", + "data", "1"); + var device2 = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "dprfinder", + "data", "2"); + if (cleanStart) + { + device1.PurgeAll(); + device2.PurgeAll(); + } + + return new PingPongDevice(device1, device2, true); + } + + public async Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + var connString = Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + await blobContainerClient.CreateIfNotExistsAsync(); + var blobClient = blobContainerClient.GetBlobClient(fileName); + + await blobClient.UploadAsync(bytes, overwrite: true); + } +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/LogEntry.cs b/cs/research/darq/DistributedTransactions/LogEntry.cs new file mode 100644 index 000000000..29bf60945 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/LogEntry.cs @@ -0,0 +1,186 @@ + +using 
System.Runtime.Serialization.Formatters.Binary; +using System.Runtime.InteropServices; +using System.Diagnostics; + +namespace DB { + +public enum LogType { + Begin, + Write, + Commit, + Abort, + // used for 2pc, doesn't use LSN/prevLSN until being logged as a self message + Prepare, + Ok, + Ack +} + +public struct LogEntry{ + public bool persited = false; + public long lsn; // value of DarqId for Ack messages + public long prevLsn; // do we even need this if we are undoing? for Ack messages, this is original tid + public long tid; + public LogType type; + public PrimaryKey[] pks; + public TupleDesc[][] tupleDescs; + public byte[][] vals; + public static readonly int MinSize = sizeof(long) * 3 + sizeof(int); + public LogEntry(long prevLsn, long tid, PrimaryKey pk, TupleDesc[] tupleDesc, byte[] val){ + this.prevLsn = prevLsn; + this.tid = tid; + this.type = LogType.Write; + this.pks = new PrimaryKey[]{pk}; + this.tupleDescs = new TupleDesc[][]{tupleDesc}; + this.vals = new byte[][]{val}; + } + public LogEntry(long prevLsn, long tid, PrimaryKey[] pks, TupleDesc[][] tupleDesc, byte[][] val){ + this.prevLsn = prevLsn; + this.tid = tid; + this.type = LogType.Prepare; + this.pks = pks; + this.tupleDescs = tupleDesc; + this.vals = val; + } + + public LogEntry(long prevLsn, long tid, LogType type){ + this.prevLsn = prevLsn; + this.tid = tid; + this.type = type; + } + + public void SetPersisted(){ + persited = true; + } + + public unsafe byte[] ToBytes(){ + // Write and Prepare logs have pk, tupleDescs, and vals + int totalSize = MinSize + (vals != null ? 
sizeof(int) : 0); + if (vals != null) for (int i = 0; i < vals.Length; i++) totalSize += vals[i].Length + sizeof(int) * 2 + tupleDescs[i].Length * TupleDesc.SizeOf + PrimaryKey.SizeOf; + + byte[] arr = new byte[totalSize]; + + fixed (byte* b = arr) { + var head = b; + *(long*)head = lsn; + head += sizeof(long); + *(long*)head = prevLsn; + head += sizeof(long); + *(long*)head = tid; + head += sizeof(long); + *(int*)head = (int)type; + head += sizeof(int); + if (type == LogType.Write || type == LogType.Prepare){ + Debug.Assert(tupleDescs.Length == vals.Length); + Debug.Assert(tupleDescs.Length == pks.Length); + *(int*)head = tupleDescs.Length; + head += sizeof(int); + for (int i = 0; i < tupleDescs.Length; i++){ + pks[i].ToBytes().CopyTo(new Span(head, PrimaryKey.SizeOf)); + head += PrimaryKey.SizeOf; + + *(int*)head = tupleDescs[i].Length; + head += sizeof(int); + for (int j = 0; j < tupleDescs[i].Length; j++) { + tupleDescs[i][j].ToBytes().CopyTo(new Span(head, TupleDesc.SizeOf)); + head += TupleDesc.SizeOf; + } + + *(int*)head = vals[i].Length; + head += sizeof(int); + vals[i].CopyTo(new Span(head, vals[i].Length)); + head += vals[i].Length; + } + } + } + + return arr; + } + + public static unsafe LogEntry FromBytes(byte[] data) { + // Ensure that the data array has enough bytes for the struct + if (data.Length < MinSize) throw new ArgumentException("Insufficient data to deserialize the struct."); + + LogEntry result = new LogEntry(); + + fixed (byte* b = data) { + var head = b; + result.lsn = *(long*)head; + head += sizeof(long); + result.prevLsn = *(long*)head; + head += sizeof(long); + result.tid = *(long*)head; + head += sizeof(long); + result.type = (LogType)(*(int*)head); + head += sizeof(int); + if (result.type == LogType.Write || result.type == LogType.Prepare){ + int len = *(int*)head; + head += sizeof(int); + result.pks = new PrimaryKey[len]; + result.tupleDescs = new TupleDesc[len][]; + result.vals = new byte[len][]; + for (int i = 0; i < len; i++){ + 
result.pks[i] = PrimaryKey.FromBytes(new Span(head, PrimaryKey.SizeOf).ToArray()); + head += PrimaryKey.SizeOf; + + int tupleLen = *(int*)head; + head += sizeof(int); + result.tupleDescs[i] = new TupleDesc[tupleLen]; + for (int j = 0; j < tupleLen; j++){ + result.tupleDescs[i][j] = TupleDesc.FromBytes(new Span(head, TupleDesc.SizeOf).ToArray()); + head += TupleDesc.SizeOf; + } + + int valSize = *(int*)head; + head += sizeof(int); + result.vals[i] = new byte[valSize]; + new Span(head, valSize).CopyTo(result.vals[i]); + head += valSize; + } + + // int len = *(int*)head; + // head += sizeof(int); + // result.keyAttrs = new KeyAttr[len]; + // result.vals = new byte[len][]; + // for (int i = 0; i < len; i++){ + // int keySize = *(int*)head; + // head += sizeof(int); + // result.keyAttrs[i] = KeyAttr.FromBytes(new Span(head, keySize).ToArray()); + // head += keySize; + // int valSize = *(int*)head; + // head += sizeof(int); + // result.vals[i] = new byte[valSize]; + // new Span(head, valSize).CopyTo(result.vals[i]); + // head += valSize; + // } + } + } + + return result; + } + + public override readonly bool Equals(object? 
obj) + { + if (obj == null || GetType() != obj.GetType()) + { + return false; + } + LogEntry o = (LogEntry)obj; + if (vals.Length != o.vals.Length) return false; + if (o.lsn != lsn || o.prevLsn != prevLsn || o.tid != tid || o.type != type) return false; + for (int i = 0; i < vals.Length; i++){ + if (!vals[i].AsSpan().SequenceEqual(o.vals[i])) return false; + if (tupleDescs[i].Length != o.tupleDescs[i].Length) return false; + for (int j = 0; j < tupleDescs[i].Length; j++){ + if (!tupleDescs[i][j].Equals(o.tupleDescs[i][j])) return false; + } + } + return true; + } + + public override readonly string ToString(){ + return $"LogEntry(lsn={lsn}, prevLsn={prevLsn}, tid={tid}, type={type})"; + } + +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/Program.cs b/cs/research/darq/DistributedTransactions/Program.cs new file mode 100644 index 000000000..6d8a0fc9a --- /dev/null +++ b/cs/research/darq/DistributedTransactions/Program.cs @@ -0,0 +1,367 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Net; +using System.Text; +using CommandLine; +using FASTER.client; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; +using FASTER.libdpr.gRPC; +using Google.Protobuf; +using Grpc.Net.Client; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using dse.services; + +namespace DB; +public class Options +{ + [Option('t', "type", Required = true, + HelpText = "type of worker to launch")] + public string Type { get; set; } + + [Option('w', "workload-trace", Required = false, + HelpText = "Workload trace file to use")] + public string WorkloadTrace { get; set; } + + [Option('o', "output-file", Required = false, + HelpText = "Name of file to output")] + public string OutputFile { get; set; } + + [Option('n', "name", Required = false, + HelpText = 
"identifier of the service to launch")] + public int WorkerName { get; set; } + + [Option('s', "speculative", Required = false, Default = false, + HelpText = "whether services proceed speculatively")] + public bool Speculative { get; set; } + + [Option('i', "issue-window", Required = false, Default = 128, + HelpText = "how many requests can be concurrently in-flight")] + public int IssueWindow { get; set; } +} + +public class Program +{ + public static async Task Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + // var environment = new LocalDebugEnvironment(); + var environment = new KubernetesLocalStorageEnvironment(true); + + switch (options.Type.Trim()) + { + case "client": + Console.WriteLine("Starting client"); + await LaunchBenchmarkClient(options, environment); + break; + case "orchestrator": + Console.WriteLine("Starting orchestrator"); + await LaunchOrchestratorService(options, environment); + break; + case "service": + Console.WriteLine("Starting distributed transaction service"); + await LaunchReservationService(options, environment); + break; + case "dprfinder": + Console.WriteLine("Starting DPR finder"); + await LaunchDprFinder(options, environment); + break; + // case "generate": + // new WorkloadGenerator() + // .SetNumClients(1) + // .SetNumServices(3) + // .SetNumWorkflowsPerSecond(100) + // .SetNumSeconds(120) + // .SetNumOfferings(10000) + // .SetBaseFileName("C:\\Users\\tianyu\\Desktop\\workloads\\test") + // .GenerateWorkloadTrace(new Random()); + // break; + default: + throw new NotImplementedException(); + } + } + + private static async Task LaunchBenchmarkClient(Options options, IEnvironment environment) + { + Console.WriteLine("Parsing workload file..."); + var timedRequests = new List<(long, ExecuteWorkflowRequest)>(); + foreach (var line in File.ReadLines(options.WorkloadTrace)) + { + 
var args = line.Split(','); + var timestamp = long.Parse(args[0]); + + var request = new ExecuteWorkflowRequest + { + WorkflowId = long.Parse(args[1]), + WorkflowClassId = 0, + Input = ByteString.CopyFrom(line, Encoding.UTF8) + }; + timedRequests.Add(ValueTuple.Create(timestamp, request)); + } + + Console.WriteLine("Creating gRPC connections..."); + // Keep a few channels around and reuse them + var channelPool = new List(); + for (var i = 0; i < 8; i++) + // k8 load-balancing will ensure that we get a spread of different orchestrators behind these channels + channelPool.Add(GrpcChannel.ForAddress(environment.GetOrchestratorConnString())); + var measurements = new ConcurrentBag(); + var stopwatch = Stopwatch.StartNew(); + Console.WriteLine("Starting Workload..."); + var rateLimiter = new SemaphoreSlim(options.IssueWindow, options.IssueWindow); + for (var i = 0; i < timedRequests.Count; i++) + { + var request = timedRequests[i]; + while (stopwatch.ElapsedMilliseconds <= request.Item1) + Thread.Yield(); + var channel = channelPool[i % channelPool.Count]; + var client = new WorkflowOrchestrator.WorkflowOrchestratorClient(channel); + await rateLimiter.WaitAsync(); + _ = Task.Run(async () => + { + // Console.WriteLine($"Issuing request to start workflow id:{request.Item2.WorkflowId}, request content: {request.Item2.Input.ToString(Encoding.UTF8)}"); + await client.ExecuteWorkflowAsync(request.Item2); + var endTime = stopwatch.ElapsedMilliseconds; + // Console.WriteLine($"workflow id:{request.Item2.WorkflowId} has completed in {endTime - request.Item1} milliseconds"); + measurements.Add(endTime - request.Item1); + rateLimiter.Release(); + }); + } + + while (measurements.Count != timedRequests.Count) + await Task.Delay(5); + await WriteResults(options, environment, measurements); + } + + private static async Task WriteResults(Options options, IEnvironment environment,ConcurrentBag measurements) + { + using var memoryStream = new MemoryStream(); + await using var 
streamWriter = new StreamWriter(memoryStream); + foreach (var line in measurements) + streamWriter.WriteLine(line); + await streamWriter.FlushAsync(); + memoryStream.Position = 0; + await environment.PublishResultsAsync(options.OutputFile, memoryStream); + } + + public static async Task LaunchOrchestratorService(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetOrchestratorPort(options), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + + var checkpointManager = environment.GetOrchestratorCheckpointManager(options); + builder.Services.AddSingleton(new DarqSettings + { + MyDpr = new DprWorkerId(options.WorkerName), + DprFinder = new GrpcDprFinder(GrpcChannel.ForAddress(environment.GetDprFinderConnString())), + LogDevice = environment.GetOrchestratorDevice(options), + LogCommitManager = checkpointManager, + PageSize = 1L << 22, + MemorySize = 1L << 30, + SegmentSize = 1L << 30, + CheckpointPeriodMilli = 10, + RefreshPeriodMilli = 5, + FastCommitMode = true, + }); + // TODO(Tianyu): Switch to epoch after testing + builder.Services.AddSingleton(typeof(IVersionScheme), typeof(RwLatchVersionScheme)); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(sp => sp.GetService()); + builder.Services.AddSingleton(new DarqMaintenanceBackgroundServiceSettings + { + morselSize = 512, + batchSize = 16, + // TODO(ophelia): put TransactionProcessorProducerWrapper here?? 
+ producerFactory = null, + speculative = true + }); + + var connectionPool = new ConcurrentDictionary(); + + + builder.Services.AddSingleton(new DarqWal(new DarqId(options.WorkerName))); + builder.Services.AddSingleton(new TpccRpcClient(options.WorkerName, connectionPool.ToDictionary(kvp => (long)kvp.Key, kvp => kvp.Value))); + + + builder.Services.AddSingleton(provider => { + Dictionary tables = new Dictionary(); + foreach (TableType tEnum in Enum.GetValues(typeof(TableType))){ + (long, int)[] schema; + switch (tEnum) { + case TableType.Warehouse: + schema = TpccSchema.WAREHOUSE_SCHEMA; + break; + case TableType.District: + schema = TpccSchema.DISTRICT_SCHEMA; + break; + case TableType.Customer: + schema = TpccSchema.CUSTOMER_SCHEMA; + break; + case TableType.History: + schema = TpccSchema.HISTORY_SCHEMA; + break; + case TableType.Item: + schema = TpccSchema.ITEM_SCHEMA; + break; + case TableType.NewOrder: + schema = TpccSchema.NEW_ORDER_SCHEMA; + break; + case TableType.Order: + schema = TpccSchema.ORDER_SCHEMA; + break; + case TableType.OrderLine: + schema = TpccSchema.ORDER_LINE_SCHEMA; + break; + case TableType.Stock: + schema = TpccSchema.STOCK_SCHEMA; + break; + default: + throw new Exception("Invalid table type"); + } + int i = (int)tEnum; + tables[i] = new ShardedTable( + i, + schema, + provider.GetRequiredService() + ); + } + return tables; + }); + + builder.Services.AddSingleton(provider => new ShardedTransactionManager( + 12, + provider.GetRequiredService(), + provider.GetRequiredService>(), + wal: provider.GetRequiredService() + )); + + builder.Services.AddSingleton( + provider => new DarqTransactionProcessorBackgroundService( + options.WorkerName, + provider.GetRequiredService>(), + provider.GetRequiredService(), + provider.GetRequiredService(), + provider.GetRequiredService(), + provider.GetRequiredService>() + ) + ); + builder.Services.AddSingleton(); + builder.Services.AddSingleton>(); + + builder.Services.AddHostedService(provider => + 
provider.GetRequiredService()); + builder.Services.AddHostedService(); + builder.Services.AddHostedService(); + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + foreach (var channel in connectionPool.Values) + channel.Dispose(); + } + + public static async Task LaunchDprFinder(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetDprFinderPort(), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + using var dprFinderServiceDevice = environment.GetDprFinderDevice(); + builder.Services.AddSingleton(dprFinderServiceDevice); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + + builder.Services.AddGrpc(); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } + + public static async Task LaunchReservationService(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetServicePort(options), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + var checkpointManager = environment.GetServiceCheckpointManager(options); + builder.Services.AddSingleton(new FasterKVSettings + { + IndexSize = 1 << 25, + LogDevice = environment.GetServiceDevice(options), + PageSize = 1 << 25, + SegmentSize = 1 << 30, + MemorySize = 1 << 30, + CheckpointManager = checkpointManager, + TryRecoverLatest = false, + }); + builder.Services.AddSingleton>(); + builder.Services.AddSingleton(new DprWorkerOptions + { + Me = new DprWorkerId(options.WorkerName), + DprFinder = new GrpcDprFinder(GrpcChannel.ForAddress(environment.GetDprFinderConnString())), + CheckpointPeriodMilli = 10, + RefreshPeriodMilli = 5 + }); + // TODO(Tianyu): Switch implementation to epoch after testing + builder.Services.AddSingleton(typeof(IVersionScheme), typeof(RwLatchVersionScheme)); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(new FasterKvReservationStartFile + { + file = options.WorkloadTrace + }); + builder.Services.AddSingleton(); + + builder.Services.AddSingleton(); + builder.Services.AddSingleton(sp => sp.GetService()); + builder.Services.AddSingleton>(); + + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + builder.Services.AddHostedService(); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + 
"Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/RpcClient.cs b/cs/research/darq/DistributedTransactions/RpcClient.cs new file mode 100644 index 000000000..d41200c7b --- /dev/null +++ b/cs/research/darq/DistributedTransactions/RpcClient.cs @@ -0,0 +1,128 @@ +using Grpc.Net.Client; +using Google.Protobuf; +using FASTER.client; +using FASTER.libdpr; +using FASTER.darq; +using System.Collections.Concurrent; + + +namespace DB { +public abstract class RpcClient { + protected long partitionId; + protected Dictionary clusterMap; + + public RpcClient(long partitionId, Dictionary clusterMap){ + this.partitionId = partitionId; + this.clusterMap = clusterMap; + } + + public long GetId(){ + return partitionId; + } + public ReadOnlySpan Read(PrimaryKey key, TransactionContext ctx){ + var channel = GetServerChannel(key); + var client = new TransactionProcessor.TransactionProcessorClient(channel); + PbPrimaryKey pk = new PbPrimaryKey { Keys = {key.Key1, key.Key2, key.Key3, key.Key4, key.Key5, key.Key6}, Table = key.Table}; + + var reply = client.Read(new ReadRequest { Key = pk, Tid = ctx.tid, PartitionId = partitionId}); + + return reply.Value.ToByteArray(); + } + + public (byte[], PrimaryKey) ReadSecondary(PrimaryKey tempPk, byte[] key, TransactionContext ctx){ + var channel = GetServerChannel(tempPk); + var client = new TransactionProcessor.TransactionProcessorClient(channel); + + var reply = client.ReadSecondary( + new ReadSecondaryRequest { + Key = ByteString.CopyFrom(key), + Table = tempPk.Table, + Tid = ctx.tid, + PartitionId = partitionId + } + ); + return (reply.Value.ToByteArray(), new PrimaryKey(tempPk.Table, reply.Key.Keys.ToArray()[0], reply.Key.Keys.ToArray()[1], reply.Key.Keys.ToArray()[2], reply.Key.Keys.ToArray()[3], 
reply.Key.Keys.ToArray()[4], reply.Key.Keys.ToArray()[5])); + } + + public void PopulateTables(BenchmarkConfig cfg, TpccConfig tpccCfg){ + foreach (var entry in clusterMap){ + if (entry.Key == partitionId) continue; + var channel = entry.Value; + var client = new TransactionProcessor.TransactionProcessorClient(channel); + var reply = client.PopulateTables( + new PopulateTablesRequest { + Seed = cfg.seed, + Ratio = cfg.ratio, + ThreadCount = cfg.threadCount, + AttrCount = cfg.attrCount, + PerThreadDataCount = cfg.perThreadDataCount, + IterationCount = cfg.iterationCount, + PerTransactionCount = cfg.perTransactionCount, + NCommitterThreads = cfg.nCommitterThreads, + NumWh = tpccCfg.NumWh, + NumDistrict = tpccCfg.NumDistrict, + NumCustomer = tpccCfg.NumCustomer, + NumItem = tpccCfg.NumItem, + NumOrder = tpccCfg.NumOrder, + NumStock = tpccCfg.NumStock, + NewOrderCrossPartitionProbability = tpccCfg.NewOrderCrossPartitionProbability, + PaymentCrossPartitionProbability = tpccCfg.PaymentCrossPartitionProbability, + PartitionsPerThread = tpccCfg.PartitionsPerThread + } + ); + if (!reply.Success) throw new System.Exception("Failed to populate tables"); + } + } + + + /// + /// Returns the appropriate channel to talk to correct shard + /// + /// + /// Null if key maps to itself, appropriate channel otherwise + private GrpcChannel? 
GetServerChannel(PrimaryKey key){ + var id = HashKeyToDarqId(key); + if (id == partitionId) return null; + return clusterMap[id]; + } + + abstract public long HashKeyToDarqId(PrimaryKey key); + + public bool IsLocalKey(PrimaryKey key){ + if (key.Table == (int)TableType.Item) return true; + return HashKeyToDarqId(key) == partitionId; + } + + public int GetNumServers(){ + return clusterMap.Count(); + } +} + +public class TpccRpcClient : RpcClient +{ + public TpccRpcClient(long partitionId, Dictionary clusterMap) : base(partitionId, clusterMap) + { + } + + public override long HashKeyToDarqId(PrimaryKey key){ + // return partitionId; + if (key.Table == (int)TableType.Item) return partitionId; + return (key.Key1 - 1) / 4; // TODO: make it / partitionsPerThread * threadCount + } +} + +public class YcsbRpcClient : RpcClient +{ + public YcsbRpcClient(long partitionId, Dictionary clusterMap) : base(partitionId, clusterMap) + { + } + + public override long HashKeyToDarqId(PrimaryKey key){ + // TODO: arbitrary for now, define some rules for how to map keys to servers + // return key.Keys[0] % clusterMap.Count; + return 0; + } + +} + +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/Table.cs b/cs/research/darq/DistributedTransactions/Table.cs new file mode 100644 index 000000000..8f784ee27 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/Table.cs @@ -0,0 +1,383 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Concurrent; +using System.Reflection; +using System.Runtime.InteropServices; +using System.Text; +using System.Runtime.CompilerServices; +using FASTER.common; + +[assembly:InternalsVisibleTo("TableTests")] +namespace DB { + +/// +/// Stores data as a byte array (TODO: change to generic type) +/// Variable length pointers are stored as {size_of_variable}{address_of_variable} and have a size of IntPtr.Size * 2 +/// Always uses ReadOnlySpan with the user +/// Assumes 
schema is never changed after creation +/// +public unsafe class Table : IDisposable{ + private int id; + private long lastId = 0; + public int rowSize; + // TODO: bool can be a single bit + internal long[] metadataOrder; + internal Dictionary metadata; // (size, offset), size=-1 if varLen + internal ConcurrentDictionary data; + + // Secondary index + protected ConcurrentDictionary secondaryIndex; + + protected ILogger logger; + IWriteAheadLog? wal; + + public Table(int id, (long, int)[] schema, IWriteAheadLog? wal = null, ILogger logger = null){ + this.id = id; + this.metadata = new Dictionary(); + this.metadataOrder = new long[schema.Length]; + this.wal = wal; + this.logger = logger; + this.secondaryIndex = new ConcurrentDictionary(new ByteArrayComparer()); + + int offset = 0; + int size = 0; + for(int i = 0; i < schema.Length; i++) { + long attr = schema[i].Item1; + size = schema[i].Item2; + if (size <= 0 && size != -1) { + throw new ArgumentException(); + } + this.metadata[attr] = (size, offset); + this.metadataOrder[i] = attr; + offset += (size == -1) ? 
IntPtr.Size * 2 : size; + } + this.rowSize = offset; + this.data = new ConcurrentDictionary(); + } + + // will never return null, empty + virtual public ReadOnlySpan Read(PrimaryKey tupleId, TupleDesc[] tupleDescs, TransactionContext ctx) { + Validate(tupleDescs, null, false); + // PrintDebug($"Reading normal {tupleId}", ctx); + + ReadOnlySpan value = ctx.GetFromReadset(tupleId); + if (value == null) { + value = Read(tupleId); + } + + ReadOnlySpan result; + // apply writeset + (TupleDesc[], byte[]) changes = ctx.GetFromWriteset(tupleId); + if (changes.Item2 != null) { + Span updatedValue = value.ToArray(); + foreach (TupleDesc td in changes.Item1) { + int offset = this.metadata[td.Attr].Item2; + changes.Item2.AsSpan(td.Offset, td.Size).CopyTo(updatedValue.Slice(offset)); + } + result = updatedValue; + } else { + result = value; + } + // TODO: deal with varLen + // project out the attributes + ctx.AddReadSet(tupleId, result); + return project(result, tupleDescs); + } + + virtual public (byte[], PrimaryKey) ReadSecondary(byte[] key, TupleDesc[] tupleDescs, TransactionContext ctx){ + if (secondaryIndex == null){ + throw new InvalidOperationException("Secondary index not set"); + } + PrimaryKey pk = secondaryIndex[key]; + return (Read(pk, tupleDescs, ctx).ToArray(), pk); + } + + protected ReadOnlySpan project(ReadOnlySpan value, TupleDesc[] tupleDescs){ + // TODO: do this without allocating + int totalSize = tupleDescs[tupleDescs.Length - 1].Offset + tupleDescs[tupleDescs.Length - 1].Size; + + Span result = new byte[totalSize]; + foreach (TupleDesc td in tupleDescs){ + int offset = this.metadata[td.Attr].Item2; + value.Slice(offset, td.Size).CopyTo(result.Slice(td.Offset, td.Size)); + } + return result; + } + + // Assumes attribute is valid + protected internal ReadOnlySpan Read(PrimaryKey tupleId){ + if (!this.data.ContainsKey(tupleId)){ // TODO: validate table + return new byte[this.rowSize]; + } + // TODO: deal with varLen + return this.data[tupleId]; + } + + 
protected Pointer GetVarLenPtr(PrimaryKey key, int offset){ + byte[] addr = (new ReadOnlySpan(this.data[key], offset + IntPtr.Size, IntPtr.Size)).ToArray(); + byte[] size = (new ReadOnlySpan(this.data[key], offset, IntPtr.Size)).ToArray(); + IntPtr res = new IntPtr(BitConverter.ToInt64(addr)); //TODO convert based on nint size + return new Pointer(res, BitConverter.ToInt32(size)); + } + + /// + /// Insert entire row + /// + /// + /// + /// + /// + public PrimaryKey Insert(ReadOnlySpan value, TransactionContext ctx){ + if (value.Length != this.rowSize){ + throw new ArgumentException($"Expected size {this.rowSize} for new record but instead got size {value.Length}"); + } + + long id = NewRecordId(); // TODO: make sure this new record id falls within range of this partition in shardedBenchmark + PrimaryKey tupleId = new PrimaryKey(this.id, id); + ctx.AddWriteSet(ref tupleId, GetSchema(), value); + if (wal != null){ + wal.Write(ctx.tid, ref tupleId, GetSchema(), value.ToArray()); + } + + return tupleId; + } + + /// + /// Insert specified attributes into table. Non-specified attributes will be 0 + /// + /// + /// + /// + /// Key already exists + /// whether insert succeeded + public bool Insert(ref PrimaryKey id, ReadOnlySpan value, TransactionContext ctx){ + if (value.Length != this.rowSize){ + throw new ArgumentException($"Expected size {this.rowSize} for new record but instead got size {value.Length}"); + } + // PrintDebug($"Inserting {id}", ctx); + if (this.data.ContainsKey(id)){ + return false; + } + if (wal != null){ + wal.Write(ctx.tid, ref id, GetSchema(), value.ToArray()); + } + ctx.AddWriteSet(ref id, GetSchema(), value); + + return true; + } + + /// + /// Update values described by tupleDescs + /// + /// + /// Describes size and offset of what is in value + /// + /// + /// + public void Update(ref PrimaryKey tupleId, TupleDesc[] tupleDescs, ReadOnlySpan value, TransactionContext ctx){ + // TODO: how to check it already exists in other shard? 
+ // if (!this.data.ContainsKey(tupleId) && !ctx.InWriteSet(ref tupleId)){ + // throw new ArgumentException($"Key {tupleId} does not exist"); + // } + Validate(tupleDescs, value, true); + + ctx.AddWriteSet(ref tupleId, tupleDescs, value); + if (wal != null){ + wal.Write(ctx.tid, ref tupleId, tupleDescs, value.ToArray()); + } + } + + /// + /// Write value to specific attribute of key. If key does not exist yet, this is an insert + /// /// + /// + /// + protected internal void Write(ref PrimaryKey pk, TupleDesc[] tds, byte[] value){ + // TODO: is it safe to assume if key exists in writeset, it is an update? + // this will receive pk over and over again with tds building + if (!this.data.ContainsKey(pk)){ + // insert + if (value.Length != this.rowSize){ + throw new ArgumentException($"Expected size {this.rowSize} for new record {pk} but instead got size {value.Length}"); + } + this.data[pk] = value; + } else { + // update + int start = 0; + foreach (TupleDesc td in tds){ + (int size, int offset) = this.metadata[td.Attr]; + value.AsSpan(start,td.Size).CopyTo(this.data[pk].AsSpan(offset)); + start += td.Size; + } + } + } + + public void AddSecondaryIndex(Dictionary index){ + foreach (var entry in index){ + bool success = secondaryIndex.TryAdd(entry.Key, entry.Value); + if (!success){ + throw new ArgumentException($"Secondary index already has {entry.Key}"); + } + } + } + + public void Dispose(){ + // iterate through all of the table to find pointers and dispose of + foreach (var field in metadata){ + if (field.Value.Item1 == -1) { + int offset = field.Value.Item2; + foreach (var entry in data){ + IntPtr ptrToFree = GetVarLenPtr(entry.Key, offset).IntPointer; + if (ptrToFree != IntPtr.Zero){ + Marshal.FreeHGlobal(ptrToFree); + } + } + } + } + } + + public TupleDesc[] GetSchema(){ + TupleDesc[] schema = new TupleDesc[this.metadata.Count]; + for (int i = 0; i < this.metadata.Count; i++){ + long attr = this.metadataOrder[i]; + schema[i] = new TupleDesc(attr, 
this.metadata[attr].Item1, this.metadata[attr].Item2); + } + return schema; + } + + public (int, int) GetAttrMetadata(long attr){ + return this.metadata[attr]; + } + + public int GetId(){ + return this.id; + } + + virtual public void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[Table TID {(ctx != null ? ctx.tid : -1)}]: {msg}"); + } + + public void PrintTable(){ + Console.WriteLine("Metadata: "); + foreach (var field in metadata){ + Console.Write($"{field.Key} is {field.Value.Item1 == -1} {field.Value.Item1} {field.Value.Item2}\n"); + // for (int i=0; i < entry.Value.Length; i++) { + // System.Console.Write(entry.Value[i] + ","); + // } + // Console.WriteLine("\n");// + Encoding.ASCII.GetBytes(entry.Value)); + } + foreach (var entry in data){ + Console.WriteLine(entry.Key); + for (int i=0; i < entry.Value.Length; i++) { + System.Console.Write(entry.Value[i] + ","); + } + Console.WriteLine("\n");// + Encoding.ASCII.GetBytes(entry.Value)); + } + } + + protected long NewRecordId() { + return Interlocked.Increment(ref lastId); + } + + protected void Validate(TupleDesc[] tupleDescs, ReadOnlySpan value, bool write) { + int totalSize = 0; + foreach (TupleDesc desc in tupleDescs) { + if (!this.metadata.ContainsKey(desc.Attr)) { + throw new ArgumentException($"Attribute {desc.Attr} is not a valid attribute for this table"); + } + if (this.metadata[desc.Attr].Item1 != -1 && desc.Size != this.metadata[desc.Attr].Item1) { + throw new ArgumentException($"Expected size {this.metadata[desc.Attr].Item1} for attribute {desc.Attr} but instead got size {desc.Size}"); + } + totalSize += desc.Size; + } + if (write && totalSize > value.Length) { + throw new ArgumentException($"Expected size {totalSize} from tuple description but instead got size {value.Length}"); + } + } + +} + +public class ShardedTable : Table { + private RpcClient rpcClient; + // extracts relevant values from secondary key to primary key for correct shard + 
protected Func buildTempPk; + public ShardedTable(int id, (long, int)[] schema, RpcClient rpcClient, IWriteAheadLog? wal = null, ILogger logger = null) : base(id, schema, wal, logger) { + this.rpcClient = rpcClient; + } + + public override ReadOnlySpan Read(PrimaryKey tupleId, TupleDesc[] tupleDescs, TransactionContext ctx) { + Validate(tupleDescs, null, false); + // PrintDebug($"Reading {tupleId}", ctx); + + ReadOnlySpan value = ctx.GetFromReadset(tupleId); + if (value == null) { + if (rpcClient.IsLocalKey(tupleId)) { + value = Read(tupleId); + } else { + value = rpcClient.Read(tupleId, ctx); + } + } + + ReadOnlySpan result; + // apply writeset + (TupleDesc[], byte[]) changes = ctx.GetFromWriteset(tupleId); + if (changes.Item2 != null) { + Span updatedValue = value.ToArray(); + foreach (TupleDesc td in changes.Item1) { + int offset = this.metadata[td.Attr].Item2; + changes.Item2.AsSpan(td.Offset, td.Size).CopyTo(updatedValue.Slice(offset)); + } + result = updatedValue; + } else { + result = value; + } + // TODO: deal with varLen + // project out the attributes + ctx.AddReadSet(tupleId, result); + return project(result, tupleDescs); + } + + override public (byte[], PrimaryKey) ReadSecondary(byte[] key, TupleDesc[] tupleDescs, TransactionContext ctx){ + if (secondaryIndex == null){ + throw new InvalidOperationException("Secondary index not set"); + } + ReadOnlySpan value; + PrimaryKey pk; + PrimaryKey tempPk = buildTempPk(key); + if (rpcClient.IsLocalKey(tempPk)){ + pk = secondaryIndex[key]; + value = Read(pk, tupleDescs, ctx).ToArray(); + } else { + (value, pk) = rpcClient.ReadSecondary(tempPk, key, ctx); + } + + ReadOnlySpan result; + // apply writeset + (TupleDesc[], byte[]) changes = ctx.GetFromWriteset(pk); + if (changes.Item2 != null) { + Span updatedValue = value.ToArray(); + foreach (TupleDesc td in changes.Item1) { + int offset = this.metadata[td.Attr].Item2; + changes.Item2.AsSpan(td.Offset, td.Size).CopyTo(updatedValue.Slice(offset)); + } + result = 
updatedValue; + } else { + result = value; + } + // TODO: deal with varLen + // project out the attributes + ctx.AddReadSet(pk, result); + return (project(result, tupleDescs).ToArray(), pk); + } + + public void AddSecondaryIndex(Dictionary index, Func buildTempPk){ + base.AddSecondaryIndex(index); + this.buildTempPk = buildTempPk; + } + + override public void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[ST {rpcClient.GetId()} TID {(ctx != null ? ctx.tid : -1)}]: {msg}"); + } +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TpccBenchmark/Definitions.cs b/cs/research/darq/DistributedTransactions/TpccBenchmark/Definitions.cs new file mode 100644 index 000000000..22b326019 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TpccBenchmark/Definitions.cs @@ -0,0 +1,230 @@ +namespace DB { +public enum TableType{ + Warehouse, + District, + Customer, + History, + Order, + NewOrder, + Item, + OrderLine, + Stock +} +public enum TableField{ + W_ID, + W_NAME, + W_STREET_1, + W_STREET_2, + W_CITY, + W_STATE, + W_ZIP, + W_TAX, + W_YTD, + D_ID, + D_W_ID, + D_NAME, + D_STREET_1, + D_STREET_2, + D_CITY, + D_STATE, + D_ZIP, + D_TAX, + D_YTD, + D_NEXT_O_ID, + C_ID, + C_D_ID, + C_W_ID, + C_FIRST, + C_MIDDLE, + C_LAST, + C_STREET_1, + C_STREET_2, + C_CITY, + C_STATE, + C_ZIP, + C_PHONE, + C_SINCE, + C_CREDIT, + C_CREDIT_LIM, + C_DISCOUNT, + C_BALANCE, + C_YTD_PAYMENT, + C_PAYMENT_CNT, + C_DELIVERY_CNT, + C_DATA, + H_C_ID, + H_C_D_ID, + H_C_W_ID, + H_D_ID, + H_W_ID, + H_DATE, + H_AMOUNT, + H_DATA, + NO_O_ID, + NO_D_ID, + NO_W_ID, + O_ID, + O_D_ID, + O_W_ID, + O_C_ID, + O_ENTRY_D, + O_CARRIER_ID, + O_OL_CNT, + O_ALL_LOCAL, + OL_O_ID, + OL_D_ID, + OL_W_ID, + OL_NUMBER, + OL_I_ID, + OL_SUPPLY_W_ID, + OL_DELIVERY_D, + OL_QUANTITY, + OL_AMOUNT, + OL_DIST_INFO, + I_ID, + I_IM_ID, + I_NAME, + I_PRICE, + I_DATA, + S_ID, + S_W_ID, + S_QUANTITY, + S_DIST_01, + S_DIST_02, + S_DIST_03, + 
S_DIST_04, + S_DIST_05, + S_DIST_06, + S_DIST_07, + S_DIST_08, + S_DIST_09, + S_DIST_10, + S_YTD, + S_ORDER_CNT, + S_REMOTE_CNT, + S_DATA +} + +// value fields fit smallest data type possible for ids, despite all being converted to long +public static class TpccSchema { + public static Dictionary> tablesToFields = new Dictionary> { + {TableType.Warehouse, new List{ TableField.W_NAME, TableField.W_STREET_1, TableField.W_STREET_2, TableField.W_CITY, TableField.W_STATE, TableField.W_ZIP, TableField.W_TAX, TableField.W_YTD}}, + {TableType.District, new List{TableField.D_NAME, TableField.D_STREET_1, TableField.D_STREET_2, TableField.D_CITY, TableField.D_STATE, TableField.D_ZIP, TableField.D_TAX, TableField.D_YTD, TableField.D_NEXT_O_ID}}, + {TableType.Customer, new List{TableField.C_FIRST, TableField.C_MIDDLE, TableField.C_LAST, TableField.C_STREET_1, TableField.C_STREET_2, TableField.C_CITY, TableField.C_STATE, TableField.C_ZIP, TableField.C_PHONE, TableField.C_SINCE, TableField.C_CREDIT, TableField.C_CREDIT_LIM, TableField.C_DISCOUNT, TableField.C_BALANCE, TableField.C_YTD_PAYMENT, TableField.C_PAYMENT_CNT, TableField.C_DELIVERY_CNT, TableField.C_DATA}}, + {TableType.History, new List{TableField.H_AMOUNT, TableField.H_DATA}}, + {TableType.Order, new List{TableField.O_C_ID, TableField.O_ENTRY_D, TableField.O_CARRIER_ID, TableField.O_OL_CNT, TableField.O_ALL_LOCAL}}, + {TableType.NewOrder, new List()}, + {TableType.Item, new List{TableField.I_IM_ID, TableField.I_NAME, TableField.I_PRICE, TableField.I_DATA}}, + {TableType.OrderLine, new List{TableField.OL_I_ID, TableField.OL_SUPPLY_W_ID, TableField.OL_DELIVERY_D, TableField.OL_QUANTITY, TableField.OL_AMOUNT, TableField.OL_DIST_INFO}}, + {TableType.Stock, new List{TableField.S_QUANTITY, TableField.S_DIST_01, TableField.S_DIST_02, TableField.S_DIST_03, TableField.S_DIST_04, TableField.S_DIST_05, TableField.S_DIST_06, TableField.S_DIST_07, TableField.S_DIST_08, TableField.S_DIST_09, TableField.S_DIST_10, TableField.S_YTD, 
TableField.S_ORDER_CNT, TableField.S_REMOTE_CNT, TableField.S_DATA}} + }; + + public static Dictionary fieldsToSchema = new Dictionary{ + // {TableField.W_ID, (8, typeof(long))}, + {TableField.W_NAME, (10, typeof(string))}, + {TableField.W_STREET_1, (20, typeof(string))}, + {TableField.W_STREET_2, (20, typeof(string))}, + {TableField.W_CITY, (20, typeof(string))}, + {TableField.W_STATE, (2, typeof(string))}, + {TableField.W_ZIP, (9, typeof(string))}, + {TableField.W_TAX, (4, typeof(float))}, + {TableField.W_YTD, (4, typeof(float))}, + // {TableField.D_ID, (1, typeof(byte))}, + // {TableField.D_W_ID, (8, typeof(long))}, + {TableField.D_NAME, (10, typeof(string))}, + {TableField.D_STREET_1, (20, typeof(string))}, + {TableField.D_STREET_2, (20, typeof(string))}, + {TableField.D_CITY, (20, typeof(string))}, + {TableField.D_STATE, (2, typeof(string))}, + {TableField.D_ZIP, (9, typeof(string))}, + {TableField.D_TAX, (4, typeof(float))}, + {TableField.D_YTD, (4, typeof(float))}, + {TableField.D_NEXT_O_ID, (4, typeof(int))}, + // {TableField.C_ID, (4, typeof(int))}, + // {TableField.C_D_ID, (1, typeof(byte))}, + // {TableField.C_W_ID, (8, typeof(long))}, + {TableField.C_FIRST, (16, typeof(string))}, + {TableField.C_MIDDLE, (2, typeof(string))}, + {TableField.C_LAST, (16, typeof(string))}, + {TableField.C_STREET_1, (20, typeof(string))}, + {TableField.C_STREET_2, (20, typeof(string))}, + {TableField.C_CITY, (20, typeof(string))}, + {TableField.C_STATE, (2, typeof(string))}, + {TableField.C_ZIP, (9, typeof(string))}, + {TableField.C_PHONE, (16, typeof(string))}, + {TableField.C_SINCE, (8, typeof(DateTime))}, + {TableField.C_CREDIT, (2, typeof(byte[]))}, + {TableField.C_CREDIT_LIM, (4, typeof(float))}, + {TableField.C_DISCOUNT, (4, typeof(float))}, + {TableField.C_BALANCE, (4, typeof(float))}, + {TableField.C_YTD_PAYMENT, (4, typeof(float))}, + {TableField.C_PAYMENT_CNT, (4, typeof(int))}, + {TableField.C_DELIVERY_CNT, (4, typeof(int))}, + {TableField.C_DATA, (500, 
typeof(string))}, + // {TableField.H_C_ID, (4, typeof(int))}, + // {TableField.H_C_D_ID, (1, typeof(byte))}, + // {TableField.H_C_W_ID, (8, typeof(long))}, + // {TableField.H_D_ID, (1, typeof(byte))}, + // {TableField.H_W_ID, (8, typeof(long))}, + // {TableField.H_DATE, (8, typeof(DateTime))}, + {TableField.H_AMOUNT, (4, typeof(float))}, + {TableField.H_DATA, (24, typeof(string))}, + // {TableField.NO_O_ID, (4, typeof(int))}, + // {TableField.NO_D_ID, (1, typeof(byte))}, + // {TableField.NO_W_ID, (8, typeof(long))}, + // {TableField.O_ID, (4, typeof(int))}, + // {TableField.O_D_ID, (1, typeof(byte))}, + // {TableField.O_W_ID, (8, typeof(long))}, + {TableField.O_C_ID, (4, typeof(long))}, + {TableField.O_ENTRY_D, (8, typeof(DateTime))}, + {TableField.O_CARRIER_ID, (4, typeof(byte))}, + {TableField.O_OL_CNT, (4, typeof(int))}, + {TableField.O_ALL_LOCAL, (4, typeof(int))}, + // {TableField.OL_O_ID, (4, typeof(int))}, + // {TableField.OL_D_ID, (1, typeof(byte))}, + // {TableField.OL_W_ID, (8, typeof(long))}, + // {TableField.OL_NUMBER, (4, typeof(int))}, + {TableField.OL_I_ID, (4, typeof(int))}, + {TableField.OL_SUPPLY_W_ID, (4, typeof(int))}, + {TableField.OL_DELIVERY_D, (8, typeof(DateTime))}, + {TableField.OL_QUANTITY, (4, typeof(int))}, + {TableField.OL_AMOUNT, (4, typeof(float))}, + {TableField.OL_DIST_INFO, (24, typeof(string))}, + // {TableField.I_ID, (4, typeof(int))}, + {TableField.I_IM_ID, (4, typeof(int))}, + {TableField.I_NAME, (24, typeof(string))}, + {TableField.I_PRICE, (4, typeof(float))}, + {TableField.I_DATA, (50, typeof(string))}, + // {TableField.S_ID, (4, typeof(int))}, + // {TableField.S_W_ID, (8, typeof(long))}, + {TableField.S_QUANTITY, (4, typeof(int))}, + {TableField.S_DIST_01, (24, typeof(string))}, + {TableField.S_DIST_02, (24, typeof(string))}, + {TableField.S_DIST_03, (24, typeof(string))}, + {TableField.S_DIST_04, (24, typeof(string))}, + {TableField.S_DIST_05, (24, typeof(string))}, + {TableField.S_DIST_06, (24, typeof(string))}, + 
{TableField.S_DIST_07, (24, typeof(string))}, + {TableField.S_DIST_08, (24, typeof(string))}, + {TableField.S_DIST_09, (24, typeof(string))}, + {TableField.S_DIST_10, (24, typeof(string))}, + {TableField.S_YTD, (4, typeof(int))}, + {TableField.S_ORDER_CNT, (4, typeof(int))}, + {TableField.S_REMOTE_CNT, (4, typeof(int))}, + {TableField.S_DATA, (50, typeof(string))} + }; + + public static (long, int)[] WAREHOUSE_SCHEMA = tablesToFields[TableType.Warehouse].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] DISTRICT_SCHEMA = tablesToFields[TableType.District].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] CUSTOMER_SCHEMA = tablesToFields[TableType.Customer].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] HISTORY_SCHEMA = tablesToFields[TableType.History].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] ORDER_SCHEMA = tablesToFields[TableType.Order].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] NEW_ORDER_SCHEMA = tablesToFields[TableType.NewOrder].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] ITEM_SCHEMA = tablesToFields[TableType.Item].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] ORDER_LINE_SCHEMA = tablesToFields[TableType.OrderLine].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + public static (long, int)[] STOCK_SCHEMA = tablesToFields[TableType.Stock].Select(f => ((long)f, fieldsToSchema[f].Item1)).ToArray(); + + public static Func customerBuildTempPk = secondaryKey => new PrimaryKey((int)TableType.Customer, BitConverter.ToInt32(secondaryKey[0..sizeof(int)])); +} + +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TpccBenchmark/FastRandom.cs b/cs/research/darq/DistributedTransactions/TpccBenchmark/FastRandom.cs new file mode 100644 
index 000000000..e061b78c7 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TpccBenchmark/FastRandom.cs @@ -0,0 +1,360 @@ +/* *************************************************************************** + * This file is part of SharpNEAT - Evolution of Neural Networks. + * + * Copyright 2004-2006, 2009-2010 Colin Green (sharpneat@gmail.com) + * + * SharpNEAT is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * SharpNEAT is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with SharpNEAT. If not, see . + */ + +// ENHANCEMENT: Replace usages of this class with the superceding version from Math.Net. +using System; + +namespace SharpNeat.Utility +{ + /// + /// A fast random number generator for .NET + /// Colin Green, January 2005 + /// + /// Key points: + /// 1) Based on a simple and fast xor-shift pseudo random number generator (RNG) specified in: + /// Marsaglia, George. (2003). Xorshift RNGs. + /// http://www.jstatsoft.org/v08/i14/xorshift.pdf + /// + /// This particular implementation of xorshift has a period of 2^128-1. See the above paper to see + /// how this can be easily extened if you need a longer period. At the time of writing I could find no + /// information on the period of System.Random for comparison. + /// + /// 2) Faster than System.Random. Up to 8x faster, depending on which methods are called. + /// + /// 3) Direct replacement for System.Random. This class implements all of the methods that System.Random + /// does plus some additional methods. 
The like named methods are functionally equivalent. + /// + /// 4) Allows fast re-initialisation with a seed, unlike System.Random which accepts a seed at construction + /// time which then executes a relatively expensive initialisation routine. This provides a vast speed improvement + /// if you need to reset the pseudo-random number sequence many times, e.g. if you want to re-generate the same + /// sequence of random numbers many times. An alternative might be to cache random numbers in an array, but that + /// approach is limited by memory capacity and the fact that you may also want a large number of different sequences + /// cached. Each sequence can be represented by a single seed value (int) when using FastRandom. + /// + /// Notes. + /// A further performance improvement can be obtained by declaring local variables as static, thus avoiding + /// re-allocation of variables on each call. However care should be taken if multiple instances of + /// FastRandom are in use or if being used in a multi-threaded environment. + /// + /// + /// Colin Green, September 4th 2005 + /// Added NextBytesUnsafe() - commented out by default. + /// Fixed bug in Reinitialise() - y,z and w variables were not being reset. + /// + /// + /// Colin Green, December 2008. + /// Fix to Next() - Was previously able to return int.MaxValue, contrary to the method's contract and comments. + /// Modified NextBool() to use _bitMask instead of a count of remaining bits. Also reset the bit buffer in Reinitialise(). + /// + public class FastRandom + { + // The +1 ensures NextDouble doesn't generate 1.0 + const double REAL_UNIT_INT = 1.0/((double)int.MaxValue+1.0); + const double REAL_UNIT_UINT = 1.0/((double)uint.MaxValue+1.0); + const uint Y=842502087, Z=3579807591, W=273326509; + + uint _x, _y, _z, _w; + + #region Constructors + + /// + /// Initialises a new instance using time dependent seed. + /// + public FastRandom() + { + // Initialise using the system tick count. 
+ Reinitialise((int)Environment.TickCount); + } + + /// + /// Initialises a new instance using an int value as seed. + /// This constructor signature is provided to maintain compatibility with + /// System.Random + /// + public FastRandom(int seed) + { + Reinitialise(seed); + } + + #endregion + + #region Public Methods [Reinitialisation] + + /// + /// Reinitialises using an int value as a seed. + /// + public void Reinitialise(int seed) + { + // The only stipulation stated for the xorshift RNG is that at least one of + // the seeds x,y,z,w is non-zero. We fulfill that requirement by only allowing + // resetting of the x seed + _x = (uint)seed; + _y = Y; + _z = Z; + _w = W; + + _bitBuffer = 0; + _bitMask=1; + } + + #endregion + + #region Public Methods [System.Random functionally equivalent methods] + + /// + /// Generates a random int over the range 0 to int.MaxValue-1. + /// MaxValue is not generated in order to remain functionally equivalent to System.Random.Next(). + /// This does slightly eat into some of the performance gain over System.Random, but not much. + /// For better performance see: + /// + /// Call NextInt() for an int over the range 0 to int.MaxValue. + /// + /// Call NextUInt() and cast the result to an int to generate an int over the full Int32 value range + /// including negative values. + /// + public int Next() + { + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + _w = (_w^(_w>>19))^(t^(t>>8)); + + // Handle the special case where the value int.MaxValue is generated. This is outside of + // the range of permitted values, so we therefore call Next() to try again. + uint rtn = _w&0x7FFFFFFF; + if(rtn==0x7FFFFFFF) { + return Next(); + } + return (int)rtn; + } + + /// + /// Generates a random int over the range 0 to upperBound-1, and not including upperBound. 
+ /// + public int Next(int upperBound) + { + if(upperBound<0) { + throw new ArgumentOutOfRangeException("upperBound", upperBound, "upperBound must be >=0"); + } + + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + + // The explicit int cast before the first multiplication gives better performance. + // See comments in NextDouble. + return (int)((REAL_UNIT_INT*(int)(0x7FFFFFFF&(_w=(_w^(_w>>19))^(t^(t>>8)))))*upperBound); + } + + /// + /// Generates a random int over the range lowerBound to upperBound-1, and not including upperBound. + /// upperBound must be >= lowerBound. lowerBound may be negative. + /// + public int Next(int lowerBound, int upperBound) + { + if(lowerBound>upperBound) { + throw new ArgumentOutOfRangeException("upperBound", upperBound, "upperBound must be >=lowerBound"); + } + + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + + // The explicit int cast before the first multiplication gives better performance. + // See comments in NextDouble. + int range = upperBound-lowerBound; + if(range<0) + { // If range is <0 then an overflow has occured and must resort to using long integer arithmetic instead (slower). + // We also must use all 32 bits of precision, instead of the normal 31, which again is slower. + return lowerBound+(int)((REAL_UNIT_UINT*(double)(_w=(_w^(_w>>19))^(t^(t>>8))))*(double)((long)upperBound-(long)lowerBound)); + } + + // 31 bits of precision will suffice if range<=int.MaxValue. This allows us to cast to an int and gain + // a little more performance. + return lowerBound+(int)((REAL_UNIT_INT*(double)(int)(0x7FFFFFFF&(_w=(_w^(_w>>19))^(t^(t>>8)))))*(double)range); + } + + /// + /// Generates a random double. Values returned are from 0.0 up to but not including 1.0. + /// + public double NextDouble() + { + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + + // Here we can gain a 2x speed improvement by generating a value that can be cast to + // an int instead of the more easily available uint. 
If we then explicitly cast to an + // int the compiler will then cast the int to a double to perform the multiplication, + // this final cast is a lot faster than casting from a uint to a double. The extra cast + // to an int is very fast (the allocated bits remain the same) and so the overall effect + // of the extra cast is a significant performance improvement. + // + // Also note that the loss of one bit of precision is equivalent to what occurs within + // System.Random. + return REAL_UNIT_INT*(int)(0x7FFFFFFF&(_w=(_w^(_w>>19))^(t^(t>>8)))); + } + + /// + /// Fills the provided byte array with random bytes. + /// This method is functionally equivalent to System.Random.NextBytes(). + /// + public void NextBytes(byte[] buffer) + { + // Fill up the bulk of the buffer in chunks of 4 bytes at a time. + uint x=this._x, y=this._y, z=this._z, w=this._w; + int i=0; + uint t; + for(int bound=buffer.Length-3; i>19))^(t^(t>>8)); + + buffer[i++] = (byte)w; + buffer[i++] = (byte)(w>>8); + buffer[i++] = (byte)(w>>16); + buffer[i++] = (byte)(w>>24); + } + + // Fill up any remaining bytes in the buffer. + if(i>19))^(t^(t>>8)); + + buffer[i++] = (byte)w; + if(i>8); + if(i>16); + if(i>24); + } + } + } + } + this._x=x; this._y=y; this._z=z; this._w=w; + } + + ///// + ///// A version of NextBytes that uses a pointer to set 4 bytes of the byte buffer in one operation + ///// thus providing a nice speedup. The loop is also partially unrolled to allow out-of-order-execution, + ///// this results in about a x2 speedup on an AMD Athlon. Thus performance may vary wildly on different CPUs + ///// depending on the number of execution units available. + ///// + ///// Another significant speedup is obtained by setting the 4 bytes by indexing pDWord (e.g. pDWord[i++]=_w) + ///// instead of dereferencing it (e.g. *pDWord++=_w). + ///// + ///// Note that this routine requires the unsafe compilation flag to be specified and so is commented out by default. 
+ ///// + ///// +// public unsafe void NextBytesUnsafe(byte[] buffer) +// { +// if(buffer.Length % 8 != 0) +// throw new ArgumentException("Buffer length must be divisible by 8", "buffer"); +// +// uint _x=this._x, _y=this._y, _z=this._z, _w=this._w; +// +// fixed(byte* pByte0 = buffer) +// { +// uint* pDWord = (uint*)pByte0; +// for(int i=0, len=buffer.Length>>2; i < len; i+=2) +// { +// uint t=(_x^(_x<<11)); +// _x=_y; _y=_z; _z=_w; +// pDWord[i] = _w = (_w^(_w>>19))^(t^(t>>8)); +// +// t=(_x^(_x<<11)); +// _x=_y; _y=_z; _z=_w; +// pDWord[i+1] = _w = (_w^(_w>>19))^(t^(t>>8)); +// } +// } +// +// this._x=_x; this._y=_y; this._z=_z; this._w=_w; +// } + #endregion + + #region Public Methods [Methods not present on System.Random] + + /// + /// Generates a uint. Values returned are over the full range of a uint, + /// uint.MinValue to uint.MaxValue, inclusive. + /// + /// This is the fastest method for generating a single random number because the underlying + /// random number generator algorithm generates 32 random bits that can be cast directly to + /// a uint. + /// + public uint NextUInt() + { + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + return _w=(_w^(_w>>19))^(t^(t>>8)); + } + + /// + /// Generates a random int over the range 0 to int.MaxValue, inclusive. + /// This method differs from Next() only in that the range is 0 to int.MaxValue + /// and not 0 to int.MaxValue-1. + /// + /// The slight difference in range means this method is slightly faster than Next() + /// but is not functionally equivalent to System.Random.Next(). + /// + public int NextInt() + { + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + return (int)(0x7FFFFFFF&(_w=(_w^(_w>>19))^(t^(t>>8)))); + } + + // Buffer 32 bits in bitBuffer, return 1 at a time, keep track of how many have been returned + // with bitMask. + uint _bitBuffer; + uint _bitMask; + + /// + /// Generates a single random bit. 
+ /// This method's performance is improved by generating 32 bits in one operation and storing them + /// ready for future calls. + /// + public bool NextBool() + { + if(_bitMask==0) + { + // Generate 32 more bits. + uint t = _x^(_x<<11); + _x=_y; _y=_z; _z=_w; + _bitBuffer=_w=(_w^(_w>>19))^(t^(t>>8)); + + // Reset the bitMask that tells us which bit to read next. + _bitMask = 0x80000000; + return (_bitBuffer & _bitMask)==0; + } + + return (_bitBuffer & (_bitMask>>=1))==0; + } + + #endregion + } +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TpccBenchmark/TableBenchmark.cs b/cs/research/darq/DistributedTransactions/TpccBenchmark/TableBenchmark.cs new file mode 100644 index 000000000..84219629f --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TpccBenchmark/TableBenchmark.cs @@ -0,0 +1,269 @@ +using System; +using System.Threading; +using System.Diagnostics; +using System.Collections; + +namespace DB { + +public struct BenchmarkConfig { + public int seed; + public double ratio; + public int threadCount; + public int insertThreadCount; + public int attrCount; + public int perThreadDataCount; // enough to be larger than l3 cache + public int iterationCount; + public int datasetSize; + public int perTransactionCount; + public int nCommitterThreads; + + public BenchmarkConfig( + int seed = 12345, + double ratio = 0.2, + int threadCount = 16, + int insertThreadCount = -1, + int attrCount = 2, + int perThreadDataCount = 100000, + int iterationCount = 10, + int perTransactionCount = 1, + int nCommitterThreads = 7){ + + this.seed = seed; + this.ratio = ratio; + this.threadCount = threadCount; + this.insertThreadCount = insertThreadCount == -1 ? 
threadCount : insertThreadCount; + this.attrCount = attrCount; + this.perThreadDataCount = perThreadDataCount; + this.iterationCount = iterationCount; + this.perTransactionCount = perTransactionCount; + this.datasetSize = perThreadDataCount * threadCount; + this.nCommitterThreads = nCommitterThreads; + } + + public override readonly string ToString(){ + return $"seed: {seed}, ratio: {ratio}, threadCount: {threadCount}, attrCount: {attrCount}, perThreadDataCount: {perThreadDataCount}, iterationCount: {iterationCount}, perTransactionCount: {perTransactionCount}, datasetSize: {datasetSize}, nCommitterThreads: {nCommitterThreads}"; + } +} + +public abstract class TableBenchmark +{ + protected internal BenchmarkConfig cfg; + protected internal PrimaryKey[] keys; + protected internal byte[][] values; + protected internal BitArray isWrite; + // TODO: in the future support multiple tables, make this list + protected internal TupleDesc[] td; // TODO: remove, this is the same as schema + protected internal Thread[] workers; + protected internal BenchmarkStatistics? stats; + protected internal IWriteAheadLog? wal; + + public TableBenchmark(BenchmarkConfig cfg, IWriteAheadLog? 
wal = null){ + this.cfg = cfg; + this.wal = wal; + td = new TupleDesc[cfg.attrCount]; + workers = new Thread[Math.Max(cfg.insertThreadCount, cfg.threadCount)]; + + keys = new PrimaryKey[cfg.datasetSize]; + values = new byte[cfg.datasetSize][]; + isWrite = new BitArray(cfg.datasetSize); + } + + virtual protected internal int InsertSingleThreadedTransactions(Table tbl, TransactionManager txnManager, int thread_idx){ + int abortCount = 0; + int c = 0; + Debug.Assert(cfg.threadCount == cfg.insertThreadCount, "Insert thread count must be equal to thread count"); + for (int i = 0; i < cfg.perThreadDataCount; i += cfg.perTransactionCount){ + TransactionContext t = txnManager.Begin(); + for (int j = 0; j < cfg.perTransactionCount; j++) { + int loc = i + j + (cfg.perThreadDataCount * thread_idx); + PrimaryKey tupleId = tbl.Insert(values[loc], t); + keys[loc] = tupleId; + } + var success = txnManager.Commit(t); + if (!success){ + abortCount++; + } else { + c++; + } + } + return abortCount; + } + + protected internal int InsertMultiThreadedTransactions(Table tbl, TransactionManager txnManager) + { + int totalAborts = 0; + for (int thread = 0; thread < cfg.insertThreadCount; thread++) { + int t = thread; + workers[thread] = new Thread(() => { + int aborts = InsertSingleThreadedTransactions(tbl, txnManager, t); + Interlocked.Add(ref totalAborts, aborts); + }); + workers[thread].Start(); + } + for (int thread = 0; thread < cfg.insertThreadCount; thread++) { + workers[thread].Join(); + } + return totalAborts; + } + + virtual protected internal int WorkloadSingleThreadedTransactions(Table tbl, TransactionManager txnManager, int thread_idx, double ratio){ + int abortCount = 0; + for (int i = 0; i < cfg.perThreadDataCount; i += cfg.perTransactionCount){ + TransactionContext t = txnManager.Begin(); + for (int j = 0; j < cfg.perTransactionCount; j++){ + int loc = i + j + (cfg.perThreadDataCount * thread_idx); + PrimaryKey key = keys[loc]; + // uncomment to make workload only insert 
one attribute instead of all + // long attr = schema[loc%cfg.attrCount].Item1; + // TupleDesc[] td = new TupleDesc[]{new TupleDesc(attr, tbl.metadata[attr].Item1)}; + + if (isWrite[loc]) { + // shift value by thread_idx to write new value + int newValueIndex = loc + thread_idx < values.Length ? loc + thread_idx : values.Length - 1; + // Span val = new Span(values[newValueIndex]).Slice(0, sizeof(long)); + byte[] val = values[newValueIndex]; + tbl.Update(ref key, td, val, t); + } else { + tbl.Read(key, td, t); + } + } + var success = txnManager.Commit(t); + if (!success){ + abortCount++; + } + } + return abortCount; + } + + protected internal int WorkloadMultiThreadedTransactions(Table tbl, TransactionManager txnManager, double ratio) + { + int totalAborts = 0; + for (int thread = 0; thread < cfg.threadCount; thread++) { + int t = thread; + workers[thread] = new Thread(() => { + int aborts = WorkloadSingleThreadedTransactions(tbl, txnManager, t, ratio); + Interlocked.Add(ref totalAborts, aborts); + }); + workers[thread].Start(); + } + for (int thread = 0; thread < cfg.threadCount; thread++) { + workers[thread].Join(); + } + return totalAborts; + } + + virtual public void RunTransactions(){ + for (int i = 0; i < cfg.iterationCount; i++){ + (long, int)[] schema = new (long, int)[cfg.attrCount]; + for (int j = 0; j < td.Length; j++){ + schema[j] = (td[j].Attr, td[j].Size); + } + using (Table tbl = new Table(0, schema)) { + Dictionary tables = new Dictionary(); + tables.Add(tbl.GetId(), tbl); + TransactionManager txnManager = new TransactionManager(cfg.nCommitterThreads, tables, wal); + txnManager.Run(); + var insertSw = Stopwatch.StartNew(); + int insertAborts = InsertMultiThreadedTransactions(tbl, txnManager); // setup + insertSw.Stop(); + System.Console.WriteLine("done inserting"); + long insertMs = insertSw.ElapsedMilliseconds; + var opSw = Stopwatch.StartNew(); + int txnAborts = WorkloadMultiThreadedTransactions(tbl, txnManager, cfg.ratio); + opSw.Stop(); + long 
opMs = opSw.ElapsedMilliseconds; + stats?.AddTransactionalResult((insertMs, opMs, insertAborts, txnAborts)); + txnManager.Terminate(); + } + } + stats?.ShowAllStats(); + stats?.SaveStatsToFile(); + } +} + +public class BenchmarkStatistics { + internal readonly List insMsPerRun = new List(); + internal readonly List opsMsPerRun = new List(); + internal readonly List insAbortsPerRun = new List(); + internal readonly List txnAbortsPerRun = new List(); + internal string name; + internal BenchmarkConfig cfg; + internal TpccConfig? tpcCfg; + internal int inserts; + internal int operations; + + internal BenchmarkStatistics(string name, BenchmarkConfig cfg, int inserts, int operations, TpccConfig? tpcCfg = null) + { + this.cfg = cfg; + this.tpcCfg = tpcCfg; + this.name = name; + this.inserts = inserts; + this.operations = operations; + } + + internal void AddResult((long ims, long oms) result) + { + insMsPerRun.Add(result.ims); + opsMsPerRun.Add(result.oms); + } + + internal void AddTransactionalResult((long ims, long oms, int insAborts, int txnAborts) result) + { + insMsPerRun.Add(result.ims); + opsMsPerRun.Add(result.oms); + insAbortsPerRun.Add(result.insAborts); + txnAbortsPerRun.Add(result.txnAborts); + } + + internal string[] GetStats() { + string[] data = new string[]{ + $"Benchmark {name}", + "-----BENCHMARK CONFIG-----", + cfg.ToString(), + "-----TPCC CONFIG-----", + tpcCfg?.ToString(), + "-----STATS-----", + GetInsDataString(operations, insMsPerRun), + GetOpsDataString(inserts, operations-inserts, opsMsPerRun, txnAbortsPerRun) + }; + + if (insAbortsPerRun.Count != 0) { + data = data.Concat(new string[]{ + GetInsAbortDataString(insAbortsPerRun), + GetTxnAbortDataString(txnAbortsPerRun) + }).ToArray(); + } + + return data; + } + + internal void ShowAllStats(){ + foreach (string line in GetStats()){ + Console.WriteLine(line); + } + } + + internal async void SaveStatsToFile(){ + string now = DateTime.Now.ToString("yyyyMMdd-HHmmss"); + await 
File.WriteAllLinesAsync($"benchmark/benchmarkResults/{name}-{now}.txt", GetStats()); + } + + internal string GetOpsDataString(int inserts, int reads, List opsMsPerRun, List txnAborts) => $"{(inserts+reads-txnAborts.Average())/opsMsPerRun.Average()} successful operations/ms {(inserts+reads)/opsMsPerRun.Average()} operations/ms ({inserts+reads} operations ({inserts} inserts, {reads} reads) in {opsMsPerRun.Average()} ms)"; + internal string GetInsDataString(int inserts, List insMsPerRun) => $"{inserts/insMsPerRun.Average()} inserts/ms ({inserts} inserts in {insMsPerRun.Average()} ms)"; + internal string GetTxnAbortDataString(List txnAborts) => $"Operations: Average {txnAborts.Average()} aborts out of {cfg.datasetSize/cfg.perTransactionCount} transactions ({txnAborts.Average()/(cfg.datasetSize/cfg.perTransactionCount)*100}% abort rate)"; + internal string GetInsAbortDataString(List insAborts) => $"Insertions: Average {insAborts.Average()} aborts out of {cfg.datasetSize/cfg.perTransactionCount} transactions ({insAborts.Average()/(cfg.datasetSize/cfg.perTransactionCount)*100}% abort rate)"; + + // internal static string GetLoadingTimeLine(double insertsPerSec, long elapsedMs) + // => $"##00; {InsPerSec}: {insertsPerSec:N2}; sec: {(double)elapsedMs / 1000:N3}"; + + // internal static string GetAddressesLine(AddressLineNum lineNum, long begin, long head, long rdonly, long tail) + // => $"##{(int)lineNum:00}; begin: {begin}; head: {head}; readonly: {rdonly}; tail: {tail}"; + + // internal static string GetStatsLine(StatsLineNum lineNum, string opsPerSecTag, double opsPerSec) + // => $"##{(int)lineNum:00}; {opsPerSecTag}: {opsPerSec:N2}; {OptionsString}"; + + // internal static string GetStatsLine(StatsLineNum lineNum, string meanTag, double mean, double stdev, double stdevpct) + // => $"##{(int)lineNum:00}; {meanTag}: {mean:N2}; stdev: {stdev:N1}; stdev%: {stdevpct:N1}; {OptionsString}"; +} + +} \ No newline at end of file diff --git 
a/cs/research/darq/DistributedTransactions/TpccBenchmark/TpccBenchmark.cs b/cs/research/darq/DistributedTransactions/TpccBenchmark/TpccBenchmark.cs new file mode 100644 index 000000000..7cf4d2ccd --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TpccBenchmark/TpccBenchmark.cs @@ -0,0 +1,1291 @@ +using System.Collections.Concurrent; +using System.Text; +using System.Diagnostics; +using SharpNeat.Utility; +using FASTER.common; + +namespace DB { +public struct TpccConfig { + public int NumWh; + public int NumDistrict; + public int NumCustomer; + public int NumOrder; + public int NumItem; + public int NumStock; + public int NewOrderCrossPartitionProbability; + public int PaymentCrossPartitionProbability; + public int PartitionsPerThread; + + public TpccConfig( + int numWh = 2, + int numDistrict = 10, + int numCustomer = 3000, + int numOrder = 3000, + int numItem = 100000, + int numStock = 100000, + int newOrderCrossPartitionProbability = 10, + int paymentCrossPartitionProbability = 15, + int partitionsPerThread = 4 + ){ + NumWh = numWh; + NumDistrict = numDistrict; + NumCustomer = numCustomer; + NumOrder = numOrder; + NumItem = numItem; + NumStock = numStock; + NewOrderCrossPartitionProbability = newOrderCrossPartitionProbability; + PaymentCrossPartitionProbability = paymentCrossPartitionProbability; + PartitionsPerThread = partitionsPerThread; + } + + public override string ToString() + { + return $"NumWh: {NumWh}, NumDistrict: {NumDistrict}, NumCustomer: {NumCustomer}, NumOrder: {NumOrder}, NumItem: {NumItem}, NumStock: {NumStock}, NewOrderCrossPartitionProbability: {NewOrderCrossPartitionProbability}, PaymentCrossPartitionProbability: {PaymentCrossPartitionProbability}, PartitionsPerThread: {PartitionsPerThread}"; + } +} + +public struct Query { + + public int w_id; + public int d_id; + public int c_id; + public int c_d_id; + public int c_w_id; + public float h_amount; + public byte[] c_last; + public int o_ol_cnt; + public int[] ol_i_ids; + public int[] 
ol_supply_w_id; + public int[] ol_quantity; + public bool isNewOrder; + + public Query(int w_id, int d_id, int c_id, int o_ol_cnt, int[] ol_i_ids, int[] ol_supply_w_id, int[] ol_quantity) { + isNewOrder = true; + this.w_id = w_id; + this.d_id = d_id; + this.c_id = c_id; + this.o_ol_cnt = o_ol_cnt; + this.ol_i_ids = ol_i_ids; + this.ol_supply_w_id = ol_supply_w_id; + this.ol_quantity = ol_quantity; + } + + public Query(int w_id, int d_id, int c_id, int c_d_id, int c_w_id, float h_amount, byte[] c_last) { + isNewOrder = false; + this.w_id = w_id; + this.d_id = d_id; + this.c_id = c_id; + this.c_d_id = c_d_id; + this.c_w_id = c_w_id; + this.h_amount = h_amount; + this.c_last = c_last; + } +} + +// public struct NewOrderQuery : Query { +// public int w_id; +// public int d_id; +// public int c_id; +// public int o_ol_cnt; +// public int[] ol_i_ids; +// public int[] ol_supply_w_id; +// public int[] ol_quantity; +// public NewOrderQuery(int w_id, int d_id, int c_id, int o_ol_cnt, int[] ol_i_ids, int[] ol_supply_w_id, int[] ol_quantity){ +// this.w_id = w_id; +// this.d_id = d_id; +// this.c_id = c_id; +// this.o_ol_cnt = o_ol_cnt; +// this.ol_i_ids = ol_i_ids; +// this.ol_supply_w_id = ol_supply_w_id; +// this.ol_quantity = ol_quantity; +// } + +// public unsafe byte[] ToBytes(){ +// byte[] bytes = new byte[4 + 4 + 4 + 4 + 4 + 4 + 4 * ol_i_ids.Length + 4 * ol_supply_w_id.Length + 4 * ol_quantity.Length]; +// fixed (byte* b = bytes){ +// *(int*)b = w_id; +// *(int*)(b + 4) = d_id; +// *(int*)(b + 8) = c_id; +// *(int*)(b + 12) = o_ol_cnt; +// *(int*)(b + 16) = ol_i_ids.Length; +// for (int i = 0; i < ol_i_ids.Length; i++){ +// *(int*)(b + 20 + i * 4) = ol_i_ids[i]; +// } +// *(int*)(b + 20 + ol_i_ids.Length * 4) = ol_supply_w_id.Length; +// for (int i = 0; i < ol_supply_w_id.Length; i++){ +// *(int*)(b + 24 + ol_i_ids.Length * 4 + i * 4) = ol_supply_w_id[i]; +// } +// *(int*)(b + 24 + ol_i_ids.Length * 4 + ol_supply_w_id.Length * 4) = ol_quantity.Length; +// for (int i = 
0; i < ol_quantity.Length; i++){ +// *(int*)(b + 28 + ol_i_ids.Length * 4 + ol_supply_w_id.Length * 4 + i * 4) = ol_quantity[i]; +// } +// } +// return bytes; +// } + +// public static unsafe NewOrderQuery FromBytes(byte[] bytes){ +// int w_id, d_id, c_id, o_ol_cnt; +// int[] ol_i_ids, ol_supply_w_id, ol_quantity; +// fixed (byte* b = bytes){ +// w_id = *(int*)b; +// d_id = *(int*)(b + 4); +// c_id = *(int*)(b + 8); +// o_ol_cnt = *(int*)(b + 12); +// int ol_i_ids_len = *(int*)(b + 16); +// ol_i_ids = new int[ol_i_ids_len]; +// for (int i = 0; i < ol_i_ids_len; i++){ +// ol_i_ids[i] = *(int*)(b + 20 + i * 4); +// } +// int ol_supply_w_id_len = *(int*)(b + 20 + ol_i_ids_len * 4); +// ol_supply_w_id = new int[ol_supply_w_id_len]; +// for (int i = 0; i < ol_supply_w_id_len; i++){ +// ol_supply_w_id[i] = *(int*)(b + 24 + ol_i_ids_len * 4 + i * 4); +// } +// int ol_quantity_len = *(int*)(b + 24 + ol_i_ids_len * 4 + ol_supply_w_id_len * 4); +// ol_quantity = new int[ol_quantity_len]; +// for (int i = 0; i < ol_quantity_len; i++){ +// ol_quantity[i] = *(int*)(b + 28 + ol_i_ids_len * 4 + ol_supply_w_id_len * 4 + i * 4); +// } +// } +// return new NewOrderQuery(w_id, d_id, c_id, o_ol_cnt, ol_i_ids, ol_supply_w_id, ol_quantity); +// } +// } + +// public struct PaymentQuery { +// public int w_id; +// public int d_id; +// public int c_id; +// public int c_d_id; +// public int c_w_id; +// public float h_amount; +// public string c_last; +// public static int Size = 4 + 4 + 4 + 4 + 4 + 4 + 16; +// public PaymentQuery(int w_id, int d_id, int c_id, int c_d_id, int c_w_id, float h_amount, string c_last){ +// this.w_id = w_id; +// this.d_id = d_id; +// this.c_id = c_id; +// this.c_d_id = c_d_id; +// this.c_w_id = c_w_id; +// this.h_amount = h_amount; +// this.c_last = c_last; +// } + +// // public unsafe byte[] ToBytes(){ +// // byte[] bytes = new byte[Size]; +// // fixed (byte* b = bytes){ +// // *(int*)b = w_id; +// // *(int*)(b + 4) = d_id; +// // *(int*)(b + 8) = c_id; +// // 
*(int*)(b + 12) = c_d_id; +// // *(int*)(b + 16) = c_w_id; +// // *(float*)(b + 20) = h_amount; +// // Encoding.ASCII.GetBytes(c_last).CopyTo(bytes, 24); +// // } +// // return bytes; +// // } + +// // public static unsafe PaymentQuery FromBytes(byte[] bytes){ +// // int w_id, d_id, c_id, c_d_id, c_w_id; +// // float h_amount; +// // string c_last; +// // fixed (byte* b = bytes){ +// // w_id = *(int*)b; +// // d_id = *(int*)(b + 4); +// // c_id = *(int*)(b + 8); +// // c_d_id = *(int*)(b + 12); +// // c_w_id = *(int*)(b + 16); +// // h_amount = *(float*)(b + 20); +// // c_last = Encoding.ASCII.GetString(bytes, 24, 16); +// // } +// // return new PaymentQuery(w_id, d_id, c_id, c_d_id, c_w_id, h_amount, c_last); +// // } + +// } + +/// +/// Adapted from https://github.com/SQLServerIO/TPCCBench/blob/master/TPCCDatabaseGenerator/TPCCGenData.cs +/// and coco +/// +public class TpccBenchmark : TableBenchmark { + + private static readonly FastRandom Frnd = new FastRandom(); + private static byte[] RandHold = Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"); + private static byte[] ZipRandHold = Encoding.ASCII.GetBytes("1234567890"); + private static byte SpaceAsByte = (byte)' '; + private static byte[] ZeroAsBytes = BitConverter.GetBytes(0); + private static byte[] BcAsBytes = { (byte)'B', (byte)'C' }; + private static byte[][] LastNames = new byte[][] { + Encoding.ASCII.GetBytes("BAR"), Encoding.ASCII.GetBytes("OUGHT"), Encoding.ASCII.GetBytes("ABLE"), Encoding.ASCII.GetBytes("PRI"), Encoding.ASCII.GetBytes("PRES"), + Encoding.ASCII.GetBytes("ESE"), Encoding.ASCII.GetBytes("ANTI"), Encoding.ASCII.GetBytes("CALLY"), Encoding.ASCII.GetBytes("ATION"), Encoding.ASCII.GetBytes("EING") + }; + + private static byte[] ORIGINAL = Encoding.ASCII.GetBytes("ORIGINAL"); + private static TupleDesc[] updateStockTds = new TupleDesc[] { + new TupleDesc((int)TableField.S_QUANTITY, TpccSchema.fieldsToSchema[TableField.S_QUANTITY].Item1, 0), + new 
TupleDesc((int)TableField.S_YTD, TpccSchema.fieldsToSchema[TableField.S_YTD].Item1, TpccSchema.fieldsToSchema[TableField.S_QUANTITY].Item1), + new TupleDesc((int)TableField.S_ORDER_CNT, TpccSchema.fieldsToSchema[TableField.S_ORDER_CNT].Item1, TpccSchema.fieldsToSchema[TableField.S_QUANTITY].Item1 + TpccSchema.fieldsToSchema[TableField.S_YTD].Item1), + new TupleDesc((int)TableField.S_REMOTE_CNT, TpccSchema.fieldsToSchema[TableField.S_REMOTE_CNT].Item1, TpccSchema.fieldsToSchema[TableField.S_QUANTITY].Item1 + TpccSchema.fieldsToSchema[TableField.S_YTD].Item1 + TpccSchema.fieldsToSchema[TableField.S_ORDER_CNT].Item1) + }; + + private static TupleDesc[] updateWarehouseTds = new TupleDesc[] { + new TupleDesc((int)TableField.W_YTD, TpccSchema.fieldsToSchema[TableField.W_YTD].Item1, 0) + }; + + private static TupleDesc[] updateDistrictYtdTds = new TupleDesc[] { + new TupleDesc((int)TableField.D_YTD, TpccSchema.fieldsToSchema[TableField.D_YTD].Item1, 0) + }; + + private static TupleDesc[] updateDistrictNextOIdTds = new TupleDesc[] { + new TupleDesc((int)TableField.D_NEXT_O_ID, TpccSchema.fieldsToSchema[TableField.D_NEXT_O_ID].Item1, 0) + }; + + private static TupleDesc[] updateCustomerTds = new TupleDesc[] { + new TupleDesc((int)TableField.C_BALANCE, TpccSchema.fieldsToSchema[TableField.C_BALANCE].Item1, 0), + new TupleDesc((int)TableField.C_YTD_PAYMENT, TpccSchema.fieldsToSchema[TableField.C_YTD_PAYMENT].Item1, TpccSchema.fieldsToSchema[TableField.C_BALANCE].Item1), + new TupleDesc((int)TableField.C_PAYMENT_CNT, TpccSchema.fieldsToSchema[TableField.C_PAYMENT_CNT].Item1, TpccSchema.fieldsToSchema[TableField.C_BALANCE].Item1 + TpccSchema.fieldsToSchema[TableField.C_YTD_PAYMENT].Item1), + new TupleDesc((int)TableField.C_DATA, TpccSchema.fieldsToSchema[TableField.C_DATA].Item1, TpccSchema.fieldsToSchema[TableField.C_BALANCE].Item1 + TpccSchema.fieldsToSchema[TableField.C_YTD_PAYMENT].Item1 + TpccSchema.fieldsToSchema[TableField.C_PAYMENT_CNT].Item1) + }; + + private static 
byte[] emptyByteArr = new byte[0]; + + Dictionary tableDataFiles = new Dictionary { + {TableType.Warehouse, "data/warehouseData_{0}.bin"}, + {TableType.District, "data/districtData_{0}.bin"}, + {TableType.Customer, "data/customerData_{0}.bin"}, + {TableType.History, "data/historyData_{0}.bin"}, + {TableType.Order, "data/orderData_{0}.bin"}, + {TableType.NewOrder, "data/newOrderData_{0}.bin"}, + {TableType.Item, "data/itemData.bin"}, + {TableType.OrderLine, "data/orderLineData_{0}.bin"}, + {TableType.Stock, "data/stockData_{0}.bin"} + }; + private int[][] ol_cnts; + private long[][] entry_ds; + + public int PartitionId; + private TpccConfig tpcCfg; + private Dictionary tables; + private ShardedTransactionManager txnManager; + Query[] queries; + private RpcClient rpcClient; + internal SimpleObjectPool orderBytePool; + internal SimpleObjectPool stockBytePool; + internal SimpleObjectPool orderLineBytePool; + internal SimpleObjectPool customerBytePool; + internal SimpleObjectPool historyBytePool; + CountdownEvent cde; + internal int[] successCounts; + // internal int[] abortCounts; + + public TpccBenchmark(int partitionId, TpccConfig tpcCfg, BenchmarkConfig cfg, Dictionary tables, ShardedTransactionManager txnManager) : base(cfg){ + System.Console.WriteLine("Init"); + PartitionId = partitionId; + this.tpcCfg = tpcCfg; + this.tables = tables; + this.txnManager = txnManager; + this.rpcClient = txnManager.GetRpcClient(); + orderBytePool = new SimpleObjectPool(() => new byte[tables[(int)TableType.Order].rowSize]); + // assume larger + int stockByteSize = TpccSchema.fieldsToSchema[TableField.S_YTD].Item1 + TpccSchema.fieldsToSchema[TableField.S_ORDER_CNT].Item1 + TpccSchema.fieldsToSchema[TableField.S_REMOTE_CNT].Item1 + TpccSchema.fieldsToSchema[TableField.S_QUANTITY].Item1; + stockBytePool = new SimpleObjectPool(() => new byte[stockByteSize]); + orderLineBytePool = new SimpleObjectPool(() => new byte[tables[(int)TableType.OrderLine].rowSize]); + // assume larger + int 
customerByteSize = TpccSchema.fieldsToSchema[TableField.C_BALANCE].Item1 + TpccSchema.fieldsToSchema[TableField.C_YTD_PAYMENT].Item1 + TpccSchema.fieldsToSchema[TableField.C_PAYMENT_CNT].Item1 + TpccSchema.fieldsToSchema[TableField.C_DATA].Item1; + customerBytePool = new SimpleObjectPool(() => new byte[customerByteSize]); + historyBytePool = new SimpleObjectPool(() => new byte[tables[(int)TableType.History].rowSize]); + cde = new CountdownEvent(cfg.threadCount * cfg.perThreadDataCount); + successCounts = new int[cfg.threadCount]; + // abortCounts = new int[cfg.threadCount]; + + Debug.Assert(cfg.threadCount <= tpcCfg.NumWh, "Thread count must be less than number of warehouses"); + Debug.Assert(tpcCfg.PartitionsPerThread * cfg.threadCount == tpcCfg.NumWh, "Partitions per thread * thread count must equal number of warehouses"); + + this.ol_cnts = new int[tpcCfg.NumDistrict][]; + for (int i = 0; i < tpcCfg.NumDistrict; i++){ + ol_cnts[i] = new int[tpcCfg.NumOrder]; + } + this.entry_ds = new long[tpcCfg.NumDistrict][]; + for (int i = 0; i < tpcCfg.NumDistrict; i++){ + entry_ds[i] = new long[tpcCfg.NumOrder]; + } + + int numNewOrders = GenerateQueryData(); + + stats = new BenchmarkStatistics($"TpccBenchmark", cfg, numNewOrders, cfg.datasetSize, tpcCfg); + System.Console.WriteLine("Done init"); + } + + private Query GenerateNewOrderQuery(int w_id, int i){ + int rbk = Frnd.Next(1, 100); + int o_ol_cnt = Frnd.Next(5, 15); + int[] ol_i_ids = new int[o_ol_cnt]; + int[] ol_supply_w_id = new int[o_ol_cnt]; + int[] ol_quantity = new int[o_ol_cnt]; + for (int j = 0; j < o_ol_cnt; j++){ + // TODO: make not the same between machines + ol_i_ids[j] = (i / cfg.perThreadDataCount) * (tpcCfg.NumItem / cfg.threadCount) + ((i * o_ol_cnt + j) % (tpcCfg.NumItem / cfg.threadCount)) + 1; + // bool retry; + // do { + // retry = false; + // ol_i_ids[j] = NonUniformRandom(8191, 1, 100000); + // for (int k = 0; k < j; k++){ + // if (ol_i_ids[j] == ol_i_ids[k]){ + // retry = true; + // break; + // 
} + // } + // } while (retry); + + // if (j == o_ol_cnt - 1 && rbk == 1){ + // ol_i_ids[j] = 0; + // } + + ol_supply_w_id[j] = w_id; + if (j == 0) { + int x = Frnd.Next(1, 100); + if (x <= tpcCfg.NewOrderCrossPartitionProbability) { + while (ol_supply_w_id[j] == w_id) { + ol_supply_w_id[j] = Frnd.Next(1, tpcCfg.NumWh + 1); + } + } + } + ol_quantity[j] = Frnd.Next(1, 10); + } + return new Query( + w_id, + Frnd.Next(1, tpcCfg.NumDistrict + 1), + NonUniformRandom(1023, 1, 3000), + o_ol_cnt, + ol_i_ids, + ol_supply_w_id, + ol_quantity + ); + } + + public Query GeneratePaymentQuery(int w_id, int i) { + int d_id = Frnd.Next(1, tpcCfg.NumDistrict + 1); + int c_w_id = w_id; + int c_d_id; + int x = Frnd.Next(1, 100); + if (x <= tpcCfg.PaymentCrossPartitionProbability) { + while (c_w_id == w_id) { + c_w_id = Frnd.Next(1, tpcCfg.NumWh + 1); + } + c_d_id = Frnd.Next(1, tpcCfg.NumDistrict + 1); + } else { + c_d_id = d_id; + } + + int y = Frnd.Next(1, 100); + byte[] c_last; + int c_id; + if (y <= 60) { + // c_last = RandLastName(0); // or testing small num + c_last = RandLastName(NonUniformRandom(255, 0, 999)); + c_id = 0; + } else { + c_id = NonUniformRandom(1023, 1, 3000); + c_last = null; + } + return new Query( + w_id, + d_id, + c_id, + c_d_id, + c_w_id, + Frnd.Next(1, 5000), + c_last + ); + } + + public int GenerateQueryData() { + this.queries = new Query[cfg.datasetSize]; + int numNewOrders = 0; + int partitionsPerMachine = tpcCfg.PartitionsPerThread * cfg.threadCount; + for (int i = 0; i < queries.Length; i++){ + int thread_idx = i / cfg.perThreadDataCount; + // int w_id = Frnd.Next((partitionId * tpcCfg.PartitionsPerThread) + 1, (partitionId * tpcCfg.PartitionsPerThread) + 1 + tpcCfg.PartitionsPerThread); + int w_id = (PartitionId * partitionsPerMachine) + thread_idx * tpcCfg.PartitionsPerThread + (i % tpcCfg.PartitionsPerThread) + 1; + // randomly assign NewOrder vs Payment + if (Frnd.Next(1, 100) <= 50){ + numNewOrders++; + queries[i] = GenerateNewOrderQuery(w_id, i); 
+ } else { + queries[i] = GeneratePaymentQuery(w_id, i); + } + } + return numNewOrders; + } + + public unsafe void NewOrder(Query query, Action callback){ + TransactionContext ctx = txnManager.Begin(); + ReadOnlySpan warehouseRow = tables[(int)TableType.Warehouse].Read(new PrimaryKey((int)TableType.Warehouse, query.w_id), tables[(int)TableType.Warehouse].GetSchema(), ctx); + PrimaryKey districtPk = new PrimaryKey((int)TableType.District, query.w_id, query.d_id); + ReadOnlySpan districtRow = tables[(int)TableType.District].Read(districtPk, tables[(int)TableType.District].GetSchema(), ctx); + ReadOnlySpan customerRow = tables[(int)TableType.Customer].Read(new PrimaryKey((int)TableType.Customer, query.w_id, query.d_id, query.c_id), tables[(int)TableType.Customer].GetSchema(), ctx); + + bool allLocal = true; + for (int i = 0; i < query.o_ol_cnt; i++) + { + if (query.ol_i_ids[i] == 0) { + txnManager.Abort(ctx, callback); + return; + } + if (query.ol_supply_w_id[i] != query.w_id) allLocal = false; + } + // update district with increment D_NEXT_O_ID + ReadOnlySpan old_d_next_o_id_bytes = ExtractField(TableType.District, TableField.D_NEXT_O_ID, districtRow); + int old_d_next_o_id = BitConverter.ToInt32(old_d_next_o_id_bytes); + int new_d_next_o_id = old_d_next_o_id + 1; + ReadOnlySpan new_d_next_o_id_bytes = new ReadOnlySpan(&new_d_next_o_id, sizeof(int)); + tables[(int)TableType.District].Update(ref districtPk, updateDistrictNextOIdTds, new_d_next_o_id_bytes, ctx); + + // insert into order and new order + PrimaryKey newOrderPk = new PrimaryKey((int)TableType.NewOrder, query.w_id, query.d_id, old_d_next_o_id); + PrimaryKey orderPk = new PrimaryKey((int)TableType.Order, query.w_id, query.d_id, old_d_next_o_id); + byte[] insertOrderData = orderBytePool.Checkout(); + SetField(TableType.Order, TableField.O_C_ID, insertOrderData, new ReadOnlySpan(&query.c_id, sizeof(int))); + long time = DateTime.Now.ToBinary(); + SetField(TableType.Order, TableField.O_ENTRY_D, insertOrderData, 
new ReadOnlySpan(&time, sizeof(long))); + SetField(TableType.Order, TableField.O_CARRIER_ID, insertOrderData, ZeroAsBytes); + SetField(TableType.Order, TableField.O_OL_CNT, insertOrderData, new ReadOnlySpan(&query.o_ol_cnt, sizeof(int))); + SetField(TableType.Order, TableField.O_ALL_LOCAL, insertOrderData, new ReadOnlySpan(&allLocal, sizeof(bool))); + bool success = tables[(int)TableType.Order].Insert(ref orderPk, insertOrderData, ctx); + if (!success) { + txnManager.Abort(ctx, callback); + return; + } + success = tables[(int)TableType.NewOrder].Insert(ref newOrderPk, emptyByteArr, ctx); + if (!success) { + txnManager.Abort(ctx, callback); + return; + } + + float total_amount = 0; + for (int i = 0; i < query.o_ol_cnt; i++) + { + ReadOnlySpan itemRow = tables[(int)TableType.Item].Read(new PrimaryKey((int)TableType.Item, query.ol_i_ids[i]), tables[(int)TableType.Item].GetSchema(), ctx); + ReadOnlySpan stockRow = tables[(int)TableType.Stock].Read(new PrimaryKey((int)TableType.Stock, query.ol_supply_w_id[i], query.ol_i_ids[i]), tables[(int)TableType.Stock].GetSchema(), ctx); + + // update stock + byte[] updateStockData = stockBytePool.Checkout(); + float i_price = BitConverter.ToSingle(ExtractField(TableType.Item, TableField.I_PRICE, itemRow)); + + int s_quantity = BitConverter.ToInt32(ExtractField(TableType.Stock, TableField.S_QUANTITY, stockRow)); + if (s_quantity >= query.ol_quantity[i] + 10) s_quantity -= query.ol_quantity[i]; + else s_quantity += 91 - query.ol_quantity[i]; + new ReadOnlySpan(&s_quantity, sizeof(int)).CopyTo(updateStockData.AsSpan(updateStockTds[0].Offset)); + + int s_ytd = BitConverter.ToInt32(ExtractField(TableType.Stock, TableField.S_YTD, stockRow)) + query.ol_quantity[i]; + new ReadOnlySpan(&s_ytd, sizeof(int)).CopyTo(updateStockData.AsSpan(updateStockTds[1].Offset)); + + int s_order_cnt = BitConverter.ToInt32(ExtractField(TableType.Stock, TableField.S_ORDER_CNT, stockRow)) + 1; + new ReadOnlySpan(&s_order_cnt, 
sizeof(int)).CopyTo(updateStockData.AsSpan(updateStockTds[2].Offset)); + + PrimaryKey stockPk = new PrimaryKey((int)TableType.Stock, query.ol_supply_w_id[i], query.ol_i_ids[i]); + if (query.ol_supply_w_id[i] != query.w_id) + { + int s_remote_cnt = BitConverter.ToInt32(ExtractField(TableType.Stock, TableField.S_REMOTE_CNT, stockRow)) + 1; + new ReadOnlySpan(&s_remote_cnt, sizeof(int)).CopyTo(updateStockData.AsSpan(updateStockTds[3].Offset)); + tables[(int)TableType.Stock].Update(ref stockPk, updateStockTds, updateStockData, ctx); + } else { + tables[(int)TableType.Stock].Update(ref stockPk, updateStockTds[0..3], updateStockData, ctx); + } + + // insert into order line + float ol_amount = i_price * query.ol_quantity[i]; + byte[] updateOrderLineData = orderLineBytePool.Checkout(); + fixed (int* b = query.ol_i_ids){ + SetField(TableType.OrderLine, TableField.OL_I_ID, updateOrderLineData, new ReadOnlySpan(&b[i], sizeof(int))); + } + fixed (int* b = query.ol_supply_w_id){ + SetField(TableType.OrderLine, TableField.OL_SUPPLY_W_ID, updateOrderLineData, new ReadOnlySpan(&b[i], sizeof(int))); + } + SetField(TableType.OrderLine, TableField.OL_DELIVERY_D, updateOrderLineData, ZeroAsBytes); + fixed (int* b = query.ol_quantity){ + SetField(TableType.OrderLine, TableField.OL_QUANTITY, updateOrderLineData, new ReadOnlySpan(&b[i], sizeof(int))); + } + SetField(TableType.OrderLine, TableField.OL_AMOUNT, updateOrderLineData, new ReadOnlySpan(&ol_amount, sizeof(float))); + ReadOnlySpan distInfo = ExtractField(TableType.Stock, TableField.S_DIST_01 + query.d_id - 1, stockRow); + SetField(TableType.OrderLine, TableField.OL_DIST_INFO, updateOrderLineData, distInfo); + PrimaryKey orderLinePk = new PrimaryKey((int)TableType.OrderLine, query.w_id, query.d_id, new_d_next_o_id, i); + success = tables[(int)TableType.OrderLine].Insert(ref orderLinePk, updateOrderLineData, ctx); + if (!success) { + txnManager.Abort(ctx, callback); + return; + } + // update total_amount + float c_discount = 
BitConverter.ToSingle(ExtractField(TableType.Customer, TableField.C_DISCOUNT, customerRow)); + float w_tax = BitConverter.ToSingle(ExtractField(TableType.Warehouse, TableField.W_TAX, warehouseRow)); + float d_tax = BitConverter.ToSingle(ExtractField(TableType.District, TableField.D_TAX, districtRow)); + total_amount += ol_amount * (1 - c_discount) * (1 + w_tax + d_tax); + + // should be fine since Insert and Update call AddToWriteset which makes a copy of the data? + stockBytePool.Return(updateStockData); + orderLineBytePool.Return(updateOrderLineData); + } + txnManager.CommitWithCallback(ctx, callback); + orderBytePool.Return(insertOrderData); + return; + } + + public unsafe void Payment(Query query, Action callback){ + TransactionContext ctx = txnManager.Begin(); + PrimaryKey warehousePk = new PrimaryKey((int)TableType.Warehouse, query.w_id); + ReadOnlySpan warehouseRow = tables[(int)TableType.Warehouse].Read(warehousePk, tables[(int)TableType.Warehouse].GetSchema(), ctx); + PrimaryKey districtPk = new PrimaryKey((int)TableType.District, query.w_id, query.d_id); + ReadOnlySpan districtRow = tables[(int)TableType.District].Read(districtPk, tables[(int)TableType.District].GetSchema(), ctx); + PrimaryKey customerPk; + ReadOnlySpan customerRow; + if (query.c_id == 0) { + // TODO + byte[] secondaryIndexKey = new byte[4+4+16]; + new ReadOnlySpan(&query.c_w_id, sizeof(int)).CopyTo(secondaryIndexKey); + new ReadOnlySpan(&query.c_d_id, sizeof(int)).CopyTo(secondaryIndexKey.AsSpan(sizeof(int))); + query.c_last.CopyTo(secondaryIndexKey.AsSpan(2 * sizeof(int))); + (customerRow, customerPk) = tables[(int)TableType.Customer].ReadSecondary(secondaryIndexKey, tables[(int)TableType.Customer].GetSchema(), ctx); + } else { + customerPk = new PrimaryKey((int)TableType.Customer, query.c_w_id, query.c_d_id, query.c_id); + customerRow = tables[(int)TableType.Customer].Read(customerPk, tables[(int)TableType.Customer].GetSchema(), ctx); + } + + // standard tpcc write to w_ytd + // update 
warehouse with increment W_YTD + float w_ytd = BitConverter.ToSingle(ExtractField(TableType.Warehouse, TableField.W_YTD, warehouseRow)) + query.h_amount; + tables[(int)TableType.Warehouse].Update(ref warehousePk, updateWarehouseTds, new ReadOnlySpan(&w_ytd, sizeof(float)), ctx); + + // update district with increment D_YTD + float d_ytd = BitConverter.ToSingle(ExtractField(TableType.District, TableField.D_YTD, districtRow)) + query.h_amount; + tables[(int)TableType.District].Update(ref districtPk, updateDistrictYtdTds, new ReadOnlySpan(&d_ytd, sizeof(float)), ctx); + + // update customer + byte[] updateCustomerData = customerBytePool.Checkout(); + ReadOnlySpan c_credit = ExtractField(TableType.Customer, TableField.C_CREDIT, customerRow); + float c_balance = BitConverter.ToSingle(ExtractField(TableType.Customer, TableField.C_BALANCE, customerRow)) - query.h_amount; + float c_ytd_payment = BitConverter.ToSingle(ExtractField(TableType.Customer, TableField.C_YTD_PAYMENT, customerRow)) + query.h_amount; + int c_payment_cnt = BitConverter.ToInt32(ExtractField(TableType.Customer, TableField.C_PAYMENT_CNT, customerRow)) + 1; + new ReadOnlySpan(&c_balance, sizeof(float)).CopyTo(updateCustomerData.AsSpan(updateCustomerTds[0].Offset)); + new ReadOnlySpan(&c_ytd_payment, sizeof(float)).CopyTo(updateCustomerData.AsSpan(updateCustomerTds[1].Offset)); + new ReadOnlySpan(&c_payment_cnt, sizeof(int)).CopyTo(updateCustomerData.AsSpan(updateCustomerTds[2].Offset)); + if (c_credit.SequenceEqual(BcAsBytes)) + { + ReadOnlySpan c_data_old = ExtractField(TableType.Customer, TableField.C_DATA, customerRow); + int offset = updateCustomerTds[3].Offset; + // TODO: need spaces between each; not according to spec but not used? 
+ new ReadOnlySpan(&query.c_id, sizeof(int)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(int); + new ReadOnlySpan(&query.c_d_id, sizeof(int)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(int); + new ReadOnlySpan(&query.c_w_id, sizeof(int)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(int); + new ReadOnlySpan(&query.d_id, sizeof(int)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(int); + new ReadOnlySpan(&query.w_id, sizeof(int)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(int); + // TODO: need formatting; not according to spec but not used? + new ReadOnlySpan(&query.h_amount, sizeof(float)).CopyTo(updateCustomerData.AsSpan(offset)); + offset += sizeof(float); + c_data_old.Slice(0, 500 - offset).CopyTo(updateCustomerData.AsSpan(offset)); + tables[(int)TableType.Customer].Update(ref customerPk, updateCustomerTds, updateCustomerData, ctx); + } else { + tables[(int)TableType.Customer].Update(ref customerPk, updateCustomerTds[0..3], updateCustomerData, ctx); + } + + // insert into history + PrimaryKey historyPk = new PrimaryKey((int)TableType.History, query.w_id, query.d_id, query.c_w_id, query.c_d_id, query.c_id, DateTime.Now.ToBinary()); + byte[] insertHistoryData = historyBytePool.Checkout(); + SetField(TableType.History, TableField.H_AMOUNT, insertHistoryData, new ReadOnlySpan(&query.h_amount, sizeof(float))); + // manually set field to avoid additional allocation + (int _, int h_offset) = tables[(int)TableType.History].GetAttrMetadata((long)TableField.H_DATA); + ExtractField(TableType.Warehouse, TableField.W_NAME, warehouseRow).CopyTo(insertHistoryData.AsSpan(h_offset, 10)); + ExtractField(TableType.District, TableField.D_NAME, districtRow).CopyTo(insertHistoryData.AsSpan(h_offset + 10, 10)); + bool success = tables[(int)TableType.History].Insert(ref historyPk, insertHistoryData, ctx); + if (!success) { + txnManager.Abort(ctx, callback); + return; + } + 
txnManager.CommitWithCallback(ctx, callback); + customerBytePool.Return(updateCustomerData); + historyBytePool.Return(insertHistoryData); + } + + override public void RunTransactions(){ + for (int i = 0; i < cfg.iterationCount; i++){ + // txnManager.Reset(); + // txnManager.Run(); + + // new Thread(()=> rpcClient.PopulateTables(cfg, tpcCfg)).Start(); // populate tables in other machines + // PopulateTables(); + // PopulateItemTable(tables[6], txnManager, 1); + // Console.WriteLine($"done inserting"); + var opSw = Stopwatch.StartNew(); + // table and txnManager not used + WorkloadMultiThreadedTransactions(tables[6], txnManager, cfg.ratio); + cde.Wait(); + opSw.Stop(); + int txnAborts = cfg.datasetSize - successCounts.Sum(); + Console.WriteLine($"abort count {txnAborts}"); + long opMs = opSw.ElapsedMilliseconds; + stats?.AddTransactionalResult((0, opMs, 0, txnAborts)); + // txnManager.Terminate(); + } + + stats?.ShowAllStats(); + stats?.SaveStatsToFile(); + } + + override protected internal int WorkloadSingleThreadedTransactions(Table table, TransactionManager txnManager, int thread_idx, double ratio) + { + Action incrementCount = (success) => { + if (success) { + Interlocked.Increment(ref successCounts[thread_idx]); + // Console.WriteLine($"Thread {thread_idx} success count now {successCounts[thread_idx]}"); + // } else { + // Interlocked.Increment(ref abortCounts[thread_idx]); + // Console.WriteLine($"Thread {thread_idx} failed count now {abortCounts[thread_idx]}"); + } + cde.Signal(); + }; + for (int i = 0; i < cfg.perThreadDataCount; i += 1){ + int loc = i + (cfg.perThreadDataCount * thread_idx); + if (queries[loc].isNewOrder){ + NewOrder(queries[loc], incrementCount); + } else { + Payment(queries[loc], incrementCount); + } + } + // return abortCount; + + // cde.Wait(); + // return abortCount; + return 0; + } + + override protected internal int InsertSingleThreadedTransactions(Table table, TransactionManager txnManager, int thread_idx){ + int abortCount = 0; + 
int perThreadDataCount = keys.Count() / cfg.insertThreadCount; + int remainder = 0; + // have the last thread handle the remaining data + if (thread_idx == cfg.insertThreadCount - 1) { + if (perThreadDataCount == 0) thread_idx = 0; + remainder = keys.Count() % cfg.insertThreadCount; + } + // Console.WriteLine($"thread {thread_idx} writes from {(perThreadDataCount * thread_idx)} to {(perThreadDataCount * thread_idx) + perThreadDataCount + cfg.perTransactionCount - 1 + remainder}"); + for (int i = 0; i < perThreadDataCount + remainder; i += cfg.perTransactionCount){ + TransactionContext ctx = txnManager.Begin(); + for (int j = 0; j < cfg.perTransactionCount; j++) { + int loc = i + j + (perThreadDataCount * thread_idx); + if (loc >= keys.Count()) break; + bool insertSuccess = table.Insert(ref keys[loc], values[loc], ctx); + if (!insertSuccess) throw new Exception($"Failed to insert record {loc} {keys[loc]} for table {table.GetId()}"); + } + var success = txnManager.Commit(ctx); + if (!success){ + abortCount++; + } + } + return abortCount; + } + + public void GenerateTables(){ + GenerateItemData(); + for (int j = 0; j < tpcCfg.NumWh; j++) { + int w_id = 1 + j; + GenerateWarehouseData(w_id); + GenerateCustomerData(w_id); + GenerateDistrictData(w_id); + GenerateHistoryData(w_id); + GenerateOrderData(w_id); + GenerateNewOrderData(w_id); + GenerateOrderLineData(w_id); + GenerateStockData(w_id); + } + } + + public void PopulateTables(){ + int partitionsPerMachine = tpcCfg.PartitionsPerThread * cfg.threadCount; + for (int j = 0; j < partitionsPerMachine; j++) { + int w_id = (PartitionId * partitionsPerMachine) + 1 + j; + foreach (TableType tableType in Enum.GetValues(typeof(TableType))) + { + Console.WriteLine($"Start with populating {tableType} for {w_id}"); + switch (tableType) + { + case TableType.Customer: + PopulateCustomerTable(tables[(int)tableType], txnManager, w_id); + break; + case TableType.Item: + PopulateItemTable(tables[(int)tableType], txnManager, w_id); + 
break; + case TableType.Warehouse: + case TableType.District: + case TableType.History: + case TableType.NewOrder: + case TableType.Order: + case TableType.OrderLine: + case TableType.Stock: + PopulateTable(tables[(int)tableType], txnManager, String.Format(tableDataFiles[tableType], w_id)); + break; + default: + throw new ArgumentException("Invalid table type"); + } + Console.WriteLine($"Done with populating {tableType}"); + } + } + System.Console.WriteLine("done inserting"); + } + + public void PopulateTable(ShardedTable table, ShardedTransactionManager txnManager, string filename){ + LoadData(table, filename); + InsertMultiThreadedTransactions(table, txnManager); + } + public void PopulateItemTable(ShardedTable table, ShardedTransactionManager txnManager, int w_id){ + using (var reader = new BinaryReader(File.Open(tableDataFiles[TableType.Item], FileMode.Open, FileAccess.Read, FileShare.Read))) { + int numItems = reader.ReadInt32(); + values = new byte[numItems][]; + keys = new PrimaryKey[numItems]; + for (int i = 0; i < numItems; i++) + { + int pkLen = reader.ReadInt32(); + byte[] pkBytes = reader.ReadBytes(pkLen); + byte[] data = reader.ReadBytes(table.rowSize); + PrimaryKey pk = PrimaryKey.FromBytes(pkBytes); + + keys[i] = new PrimaryKey(pk.Table, w_id, pk.Key1); + values[i] = data; + } + } + InsertMultiThreadedTransactions(table, txnManager); + } + public void PopulateCustomerTable(ShardedTable table, ShardedTransactionManager txnManager, int w_id){ + using (var reader = new BinaryReader(File.Open(String.Format(tableDataFiles[TableType.Customer], w_id), FileMode.Open))) { + int numEntries = reader.ReadInt32(); + keys = new PrimaryKey[numEntries]; + values = new byte[numEntries][]; + + for (int i = 0; i < numEntries; i++) + { + int keyLen = reader.ReadInt32(); + byte[] pkBytes = reader.ReadBytes(keyLen); + PrimaryKey pk = PrimaryKey.FromBytes(pkBytes); + byte[] data = reader.ReadBytes(table.rowSize); + + keys[i] = pk; + values[i] = data; + } + + Dictionary 
secondaryIndex = new Dictionary(new ByteArrayComparer()); + int secondaryIndexCount = reader.ReadInt32(); + for (int i = 0; i < secondaryIndexCount; i++) + { + int keyLen = reader.ReadInt32(); + byte[] key = reader.ReadBytes(keyLen); + int pkLen = reader.ReadInt32(); + byte[] pk = reader.ReadBytes(pkLen); + secondaryIndex[key] = PrimaryKey.FromBytes(pk); + } + table.AddSecondaryIndex(secondaryIndex, TpccSchema.customerBuildTempPk); + } + InsertMultiThreadedTransactions(table, txnManager); + } + + private void LoadData(Table table, string filename){ + using (var reader = new BinaryReader(File.Open(filename, FileMode.Open))) { + int numItems = reader.ReadInt32(); + keys = new PrimaryKey[numItems]; + values = new byte[numItems][]; + for (int i = 0; i < numItems; i++) + { + int pkLen = reader.ReadInt32(); + byte[] pkBytes = reader.ReadBytes(pkLen); + byte[] data = reader.ReadBytes(table.rowSize); + PrimaryKey pk = PrimaryKey.FromBytes(pkBytes); + + keys[i] = pk; + values[i] = data; + } + } + } + public void GenerateWarehouseData(int w_id){ + Table table = tables[(int)TableType.Warehouse]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.Warehouse], w_id), FileMode.Create))) { + writer.Write(1); + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + RandomByteString(6, 10).CopyTo(span); // W_NAME + offset += 10; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // W_STREET_1 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // W_STREET_2 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // W_CITY + offset += 20; + RandomByteString(2, 2).CopyTo(span.Slice(offset)); // W_STATE + offset += 2; + RandZip().CopyTo(span.Slice(offset)); // W_ZIP + offset += 9; + BitConverter.GetBytes(0.1000f).CopyTo(span.Slice(offset)); // W_TAX + offset += 4; + BitConverter.GetBytes(3000000.00f).CopyTo(span.Slice(offset)); // W_YTD + PrimaryKey pk = new 
PrimaryKey(table.GetId(), w_id); + // PK: W_ID + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + public void GenerateDistrictData(int w_id){ + Table table = tables[(int)TableType.District]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.District], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumDistrict); + for (int i = 1; i <= tpcCfg.NumDistrict; i++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + RandomByteString(6, 10).CopyTo(span.Slice(offset)); // D_NAME + offset += 10; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // D_STREET_1 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // D_STREET_2 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // D_CITY + offset += 20; + RandomByteString(2, 2).CopyTo(span.Slice(offset)); // D_STATE + offset += 2; + RandZip().CopyTo(span.Slice(offset)); // D_ZIP + offset += 9; + RandFloat(0, 2000, 10000).CopyTo(span.Slice(offset)); // D_TAX + offset += 4; + BitConverter.GetBytes(30000).CopyTo(span.Slice(offset)); // D_YTD + offset += 4; + BitConverter.GetBytes(tpcCfg.NumOrder + 1).CopyTo(span.Slice(offset)); // D_NEXT_O_ID + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i); + // PK: D_W_ID, D_ID + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + public unsafe void GenerateCustomerData(int w_id){ + ConcurrentDictionary secondaryIndex = new ConcurrentDictionary(new ByteArrayComparer()); + // group rows by new index attribute + Dictionary> groupByAttr = new Dictionary>(); + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.Customer], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumDistrict * tpcCfg.NumCustomer); + for (int i = 1; i <= tpcCfg.NumDistrict; i++) + { + for (int j = 1; j <= 
tpcCfg.NumCustomer; j++) + { + Table table = tables[(int)TableType.Customer]; + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + RandomByteString(8, 16).CopyTo(span.Slice(offset)); // C_FIRST + offset += 16; + new byte[]{(byte)'O', (byte)'E'}.CopyTo(span.Slice(offset)); // C_MIDDLE + offset += 2; + byte[] lastName = RandLastName(j <= 1000 ? j - 1 : NonUniformRandom(255, 0, 999)); + lastName.CopyTo(span.Slice(offset)); // C_LAST + offset += 16; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // C_STREET_1 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // C_STREET_2 + offset += 20; + RandomByteString(10, 20).CopyTo(span.Slice(offset)); // C_CITY + offset += 20; + RandomByteString(2, 2).CopyTo(span.Slice(offset)); // C_STATE + offset += 2; + RandZip().CopyTo(span.Slice(offset)); // C_ZIP + offset += 9; + RandomByteString(16, 16).CopyTo(span.Slice(offset)); // C_PHONE + offset += 16; + BitConverter.GetBytes(DateTime.Now.ToBinary()).CopyTo(span.Slice(offset)); // C_SINCE + offset += 8; + new byte[]{(byte)(Frnd.Next(0, 10) == 1 ? 
'B' : 'G'), (byte)'C'}.CopyTo(span.Slice(offset)); // C_CREDIT + offset += 2; + BitConverter.GetBytes(50000).CopyTo(span.Slice(offset)); // C_CREDIT_LIM + offset += 4; + RandFloat(0, 5000, 10000).CopyTo(span.Slice(offset)); // C_DISCOUNT + offset += 4; + BitConverter.GetBytes(-10).CopyTo(span.Slice(offset)); // C_BALANCE + offset += 4; + BitConverter.GetBytes(10).CopyTo(span.Slice(offset)); // C_YTD_PAYMENT + offset += 4; + BitConverter.GetBytes(1).CopyTo(span.Slice(offset)); // C_PAYMENT_CNT + offset += 4; + BitConverter.GetBytes(0).CopyTo(span.Slice(offset)); // C_DELIVERY_CNT + offset += 4; + RandomByteString(300, 500).CopyTo(span.Slice(offset)); // C_DATA + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i, j); + // PK: C_W_ID, C_D_ID, C_ID + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + + byte[] key = BitConverter.GetBytes(w_id).Concat(BitConverter.GetBytes(i)).Concat(lastName).ToArray(); + if (!groupByAttr.ContainsKey(key)){ + groupByAttr[key] = new List<(PrimaryKey, byte[])>(); + } + groupByAttr[key].Add((pk, data)); + + + } + + foreach (var entry in groupByAttr){ + List<(PrimaryKey, byte[])> sameLastNames = entry.Value; + // sort by C_FIRST + sameLastNames.Sort((a, b) => { + return Util.CompareArrays(a.Item2[0..16], b.Item2[0..16]); + }); + + secondaryIndex[entry.Key] = sameLastNames[(sameLastNames.Count - 1) / 2].Item1; + } + } + + // write secondary index to file + writer.Write(secondaryIndex.Count); + foreach (var entry in secondaryIndex){ + byte[] key = entry.Key; + byte[] pk = entry.Value.ToBytes(); + writer.Write(key.Length); + writer.Write(key); + writer.Write(pk.Length); + writer.Write(pk); + } + } + } + public void GenerateHistoryData(int w_id){ + Table table = tables[(int)TableType.History]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.History], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumDistrict * tpcCfg.NumCustomer); + for 
(int i = 1; i <= tpcCfg.NumDistrict; i++) + { + for (int j = 1; j <= tpcCfg.NumCustomer; j++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + BitConverter.GetBytes(10).CopyTo(span.Slice(offset)); // H_AMOUNT + offset += 4; + RandomByteString(12, 24).CopyTo(span.Slice(offset)); // H_DATA + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i, w_id, i, j, DateTime.Now.ToBinary()); + // PK: H_W_ID, H_D_ID, H_C_W_ID, H_C_D_ID, H_C_ID, H_DATE + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + } + public void GenerateNewOrderData(int w_id){ + Table table = tables[(int)TableType.NewOrder]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.NewOrder], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumDistrict * (tpcCfg.NumOrder - 2100)); + for (int i = 1; i <= tpcCfg.NumDistrict; i++) + { + for (int j = 2101; j <= tpcCfg.NumOrder; j++) + { + byte[] data = new byte[table.rowSize]; + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i, j); + // PK: NO_W_ID, NO_D_ID, NO_O_ID + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + } + public void GenerateOrderData(int w_id){ + int[] cids = new int[tpcCfg.NumOrder]; + for (int i = 1; i <= tpcCfg.NumOrder; i++) { + cids[i-1] = i; + } + + Table table = tables[(int)TableType.Order]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.Order], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumDistrict * tpcCfg.NumOrder); + for (int i = 1; i <= tpcCfg.NumDistrict; i++) + { + Util.Shuffle(Frnd, cids); + for (int j = 1; j <= tpcCfg.NumOrder; j++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + BitConverter.GetBytes(cids[j-1]).CopyTo(span.Slice(offset)); // O_C_ID + offset += 4; + 
BitConverter.GetBytes(DateTime.Now.ToBinary()).CopyTo(span.Slice(offset)); // O_ENTRY_D + offset += 8; + BitConverter.GetBytes(j < 2101 ? Frnd.Next(1,10) : 0).CopyTo(span.Slice(offset)); // O_CARRIER_ID + offset += 4; + int ol_cnt = Frnd.Next(5,15); + ol_cnts[i-1][j-1] = ol_cnt; + BitConverter.GetBytes(ol_cnt).CopyTo(span.Slice(offset)); // O_OL_CNT + offset += 4; + BitConverter.GetBytes(true).CopyTo(span.Slice(offset)); // O_ALL_LOCAL + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i, j); + // PK: O_W_ID, O_D_ID, O_ID + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + } + + // assumes that order data has been generated + public void GenerateOrderLineData(int w_id){ + Table table = tables[(int)TableType.OrderLine]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.OrderLine], w_id), FileMode.Create))) { + int count = ol_cnts.Sum(x => x.Sum(y => y)); + writer.Write(count); + + for (int i = 1; i <= tpcCfg.NumDistrict; i++) + { + for (int j = 1; j <= tpcCfg.NumOrder; j++) + { + int olCnt = ol_cnts[i-1][j-1]; + long oEntryD = entry_ds[i-1][j-1]; + + for (int k = 1; k <= olCnt; k++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + BitConverter.GetBytes(Frnd.Next(1,100000)).CopyTo(span.Slice(offset)); // OL_I_ID + offset += 4; + BitConverter.GetBytes(w_id).CopyTo(span.Slice(offset)); // OL_SUPPLY_W_ID + offset += 4; + BitConverter.GetBytes(j < 2101 ? oEntryD : 0).CopyTo(span.Slice(offset)); // OL_DELIVERY_D + offset += 8; + BitConverter.GetBytes(5).CopyTo(span.Slice(offset)); // OL_QUANTITY + offset += 4; + BitConverter.GetBytes(j < 2101 ? 
0 : Frnd.Next(1, 999999) / 100f).CopyTo(span.Slice(offset)); // OL_AMOUNT + offset += 4; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // OL_DIST_INFO + + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i, j, k); + // PK: OL_W_ID, OL_D_ID, OL_O_ID, OL_NUMBER + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + } + } + + private void GenerateItemData(){ + Table table = tables[(int)TableType.Item]; + using (var writer = new BinaryWriter(File.Open(tableDataFiles[TableType.Item], FileMode.Create))) { + writer.Write(tpcCfg.NumItem); + for (int i = 1; i <= tpcCfg.NumItem; i++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + BitConverter.GetBytes(Frnd.Next(1,10000)).CopyTo(span.Slice(offset)); // I_IM_ID + offset += 4; + RandomByteString(14, 24).CopyTo(span.Slice(offset)); // I_NAME + offset += 24; + BitConverter.GetBytes(Frnd.Next(1,100)).CopyTo(span.Slice(offset)); // I_PRICE + offset += 4; + byte[] i_data = RandomByteString(26, 50); + int strLen = Encoding.ASCII.GetString(i_data).IndexOf(' '); + if (Frnd.Next(1,10) == 1) { + int start = Frnd.Next(0, strLen - ORIGINAL.Length); + for (int j = 0; j < ORIGINAL.Length; j++) { + i_data[start + j] = ORIGINAL[j]; + } + } + i_data.CopyTo(span.Slice(offset)); // I_DATA + // PK: I_ID + PrimaryKey pk = new PrimaryKey(table.GetId(), i); + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + + public void GenerateStockData(int w_id) { + Table table = tables[(int)TableType.Stock]; + using (var writer = new BinaryWriter(File.Open(String.Format(tableDataFiles[TableType.Stock], w_id), FileMode.Create))) { + writer.Write(tpcCfg.NumStock); + for (int i = 1; i <= tpcCfg.NumStock; i++) + { + byte[] data = new byte[table.rowSize]; + Span span = new Span(data); + + int offset = 0; + 
BitConverter.GetBytes(Frnd.Next(10,100)).CopyTo(span.Slice(offset)); // S_QUANTITY + offset += 4; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_01 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_02 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_03 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_04 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_05 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_06 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_07 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_08 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_09 + offset += 24; + RandomByteString(24, 24).CopyTo(span.Slice(offset)); // S_DIST_10 + offset += 24; + BitConverter.GetBytes(0).CopyTo(span.Slice(offset)); // S_YTD + offset += 4; + BitConverter.GetBytes(0).CopyTo(span.Slice(offset)); // S_ORDER_CNT + offset += 4; + BitConverter.GetBytes(0).CopyTo(span.Slice(offset)); // S_REMOTE_CNT + offset += 4; + byte[] s_data = RandomByteString(26, 50); + int strLen = Encoding.ASCII.GetString(s_data).IndexOf(' '); + if (Frnd.Next(1,10) == 1) { + int start = Frnd.Next(0, strLen - ORIGINAL.Length); + for (int j = 0; j < ORIGINAL.Length; j++) { + s_data[start + j] = ORIGINAL[j]; + } + } + s_data.CopyTo(span.Slice(offset)); // S_DATA + PrimaryKey pk = new PrimaryKey(table.GetId(), w_id, i); + // PK: S_W_ID, S_I_ID + + byte[] pkBytes = pk.ToBytes(); + writer.Write(pkBytes.Length); + writer.Write(pkBytes); + writer.Write(data); + } + } + } + + private ReadOnlySpan ExtractField(TableType tableType, TableField field, ReadOnlySpan row) { + (int size, int offset) = tables[(int)tableType].GetAttrMetadata((long)field); + return row.Slice(offset, size); + } + + private void SetField(TableType tableType, TableField 
field, byte[] row, ReadOnlySpan value) { + (int size, int offset) = tables[(int)tableType].GetAttrMetadata((long)field); + value.CopyTo(row.AsSpan(offset)); + } + + // private (byte[], TupleDesc[]) BuildUpdate(byte[] data, TupleDesc[] tds, TableType tableType, TableField field, byte[] value){ + // int size = tables[(int)tableType].GetAttrMetadata((long)field).Item1; + // int offset = tds.Length == 0 ? 0 : tds[tds.Length - 1].Offset + tds[tds.Length - 1].Size; + // return (data.Concat(value).ToArray(), tds.Append(new TupleDesc((int)field, size, offset)).ToArray()); + // } + private void PrintByteArray(byte[] arr){ + foreach (byte b in arr){ + Console.Write(b + " "); + } + Console.WriteLine(); + } + + /// + /// Generates a random byte array with the given length, + /// padded with spaces until it reaches the maximum length + /// + /// + /// minimum size of the string + /// + /// + /// Random string + private static byte[] RandomByteString(int strMin, int strMax) + { + byte[] randomString = new byte[strMax]; + int stringLen = Frnd.Next(strMin, strMax); + for (int x = 0; x < strMax; ++x) + { + if (x < stringLen) + randomString[x] = RandHold[Frnd.Next(0, 62)]; + else + randomString[x] = SpaceAsByte; + } + + return randomString; + } + + /// + /// Generates a random zip code byte array with the given length + /// + /// Random string + private static byte[] RandZip() + { + byte[] holdZip = new byte[5]; + for (int x = 0; x < 4; ++x) + { + holdZip[x] = ZipRandHold[Frnd.Next(0, 9)]; + } + for (int x = 4; x < 5; ++x) + { + holdZip[x] = (byte)'1'; + } + + return holdZip; + } + + /// + /// Generates a 8 byte array representation of a random float + /// + /// min value random + /// max value random + /// divisor + /// 8 byte + private static byte[] RandFloat(int min, int max, int divisor){ + return BitConverter.GetBytes((float)Frnd.Next(min, max) / divisor); + } + + /// + /// Derived from coco's Random.h + /// + /// + /// + /// + /// + /// + private static int 
NonUniformRandom(int A, int min, int max){ + return ((Frnd.Next(0, A) | Frnd.Next(min, max)) % (max - min + 1)) + min; + } + + private static byte[] RandLastName(int n){ + // int len = LastNames[n / 100].Length + LastNames[(n / 10) % 10].Length + LastNames[n % 10].Length; + byte[] lastname = new byte[16]; + LastNames[n / 100].CopyTo(lastname, 0); + LastNames[(n / 10) % 10].CopyTo(lastname, LastNames[n / 100].Length); + LastNames[n % 10].CopyTo(lastname, LastNames[n / 100].Length + LastNames[(n / 10) % 10].Length); + return lastname; + } + +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TransactionContext.cs b/cs/research/darq/DistributedTransactions/TransactionContext.cs new file mode 100644 index 000000000..b03a3a9b7 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TransactionContext.cs @@ -0,0 +1,206 @@ +using System.Runtime.InteropServices; +using System.Text; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace DB { +/// +/// Data structure holding transaction context +/// +public class TransactionContext { + // TODO: find better value + private static int SET_SIZE = 0; + internal TransactionStatus status; + internal int startTxnNum; + internal List Rset; // byte[] is the entire record + internal List<(TupleDesc[], byte[])> Wset; // byte[] corresponds to the TupleDesc + internal List WsetKeys; + internal List RsetKeys; + public long tid; + public Dictionary tables; + public Action callback; + public TransactionContext(Dictionary tables){ + this.tables = tables; + } + + public void Init(int startTxn, long tid){ + this.startTxnNum = startTxn; + this.tid = tid; + status = TransactionStatus.Idle; + Rset = new List(SET_SIZE); + Wset = new List<(TupleDesc[], byte[])>(SET_SIZE); + WsetKeys = new List(SET_SIZE); + RsetKeys = new List(SET_SIZE); + } + + public bool InReadSet(ref PrimaryKey tupleId){ + return GetReadsetKeyIndex(ref tupleId) != -1; + } + public bool InWriteSet(ref PrimaryKey tupleId){ + 
return GetWriteSetKeyIndex(ref tupleId) != -1; + } + + public (TupleDesc[], byte[]) GetFromWriteset(PrimaryKey tupleId){ + int index = GetWriteSetKeyIndex(ref tupleId); + if (index == -1){ + return (null, null); + } + return (Wset[index].Item1, Wset[index].Item2); + } + public (TupleDesc[], byte[]) GetFromWriteset(int i){ + return (Wset[i].Item1, Wset[i].Item2); + } + + public ReadOnlySpan GetFromReadset(PrimaryKey tupleId){ + int index = GetReadsetKeyIndex(ref tupleId); + if (index == -1){ + return null; + } + return Rset[index]; + } + + public void AddReadSet(PrimaryKey tupleId, ReadOnlySpan val){ + // TODO: varlen + if (val.Length != tables[tupleId.Table].rowSize){ + throw new ArgumentException($"Readset value length {val.Length} does not match table row size {tables[tupleId.Table].rowSize}"); + } + Rset.Add(val.ToArray()); + RsetKeys.Add(tupleId); + } + + public void AddWriteSet(ref PrimaryKey tupleId, TupleDesc[] tupleDescs, ReadOnlySpan val){ + int index = GetWriteSetKeyIndex(ref tupleId); + if (index != -1){ + (TupleDesc[], byte[]) existing = Wset[index]; + // List result = new List(); + // int start = 0; + // foreach (TupleDesc td in existing.Item2){ + // bool included = false; + // int newStart = 0; + // foreach (TupleDesc newTd in tupleDescs){ + // if (td.Attr == newTd.Attr){ + // included = true; + + // result.AddRange(val.Slice(newStart, td.Size)); + // } + // newStart += newTd.Size; + // } + // if (!included) finalSize += td.Size; + + + // if (td.Attr == tupleDescs[0].Attr){ + // result.AddRange(val.ToArray()); + // } else { + // result.AddRange(existing.Item3.AsSpan(start, td.Size).ToArray()); + // } + // start += td.Size; + // } + + // calculate final size + int finalSize = existing.Item2.Length; + foreach (TupleDesc td in tupleDescs){ + bool included = false; + foreach (TupleDesc existingTd in existing.Item1){ + if (td.Attr == existingTd.Attr){ + included = true; + } + } + if (!included) finalSize += td.Size; + } + + // copy values, replacing 
existing values with new ones + byte[] newVal = new byte[finalSize]; + Span newValSpan = newVal; + bool[] includedTd = new bool[tupleDescs.Length]; + foreach (TupleDesc existingTd in existing.Item1){ + bool included = false; + for (int i = 0; i < tupleDescs.Length; i++){ + TupleDesc newTd = tupleDescs[i]; + if (existingTd.Attr == newTd.Attr){ + included = true; + includedTd[i] = true; + val.Slice(newTd.Offset, newTd.Size).CopyTo(newValSpan.Slice(existingTd.Offset, newTd.Size)); + break; + } + } + if (!included) { + existing.Item2.AsSpan(existingTd.Offset, existingTd.Size).CopyTo(newValSpan.Slice(existingTd.Offset, existingTd.Size)); + } + } + + // add remaining values, also to tupleDescs + TupleDesc[] newTupleDescs = new TupleDesc[existing.Item1.Length + includedTd.Count(x => !x)]; + existing.Item1.CopyTo(newTupleDescs, 0); + int start = existing.Item2.Length; + int j = existing.Item1.Length; + for (int i = 0; i < tupleDescs.Length; i++){ + if (!includedTd[i]){ + val.Slice(tupleDescs[i].Offset, tupleDescs[i].Size).CopyTo(newValSpan.Slice(start, tupleDescs[i].Size)); + newTupleDescs[j++] = tupleDescs[i]; + start += tupleDescs[i].Size; + } + } + + Wset.Add((newTupleDescs, newVal)); + WsetKeys.Add(tupleId); + } else { + Wset.Add((tupleDescs, val.ToArray())); + WsetKeys.Add(tupleId); + } + } + + public List GetReadsetKeys(){ + return RsetKeys; + } + public ListGetWritesetKeys(){ + return WsetKeys; + } + + private int GetWriteSetKeyIndex(ref PrimaryKey tupleId){ + var span = CollectionsMarshal.AsSpan(WsetKeys); + for (int i = span.Length-1; i >= 0; i--){ + ref PrimaryKey pk = ref span[i]; + // TupleDesc[] tupleDescs = span[i].Item1; + // if (tupleDescs[0].Attr == -1){ + // return i; + // } + // if (pk.Equals(tupleId)){ + // return i; + // } + if (pk.Table == tupleId.Table && pk.Key1 == tupleId.Key1 + && pk.Key2 == tupleId.Key2 && pk.Key3 == tupleId.Key3 + && pk.Key4 == tupleId.Key4 && pk.Key5 == tupleId.Key5 + && pk.Key6 == tupleId.Key6 + ){ + return i; + } + // if 
(Wset[i].Item1.Equals(tupleId)){ + // return i; + // } + } + return -1; + } + + private int GetReadsetKeyIndex(ref PrimaryKey tupleId){ + var span = CollectionsMarshal.AsSpan(RsetKeys); + for (int i = span.Length-1; i >= 0; i--){ + ref PrimaryKey pk = ref span[i]; + if (pk.Table == tupleId.Table && pk.Key1 == tupleId.Key1 + && pk.Key2 == tupleId.Key2 && pk.Key3 == tupleId.Key3 + && pk.Key4 == tupleId.Key4 && pk.Key5 == tupleId.Key5 + && pk.Key6 == tupleId.Key6 + ){ + return i; + } + // if (Rset[i].Item1.Equals(tupleId)){ + // return i; + // } + } + return -1; + } + + public override string ToString(){ + return $"Readset: {string.Join(Environment.NewLine, GetReadsetKeys())}\nWriteset: {string.Join(Environment.NewLine, GetWritesetKeys())}"; + } +} + +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TransactionManager.cs b/cs/research/darq/DistributedTransactions/TransactionManager.cs new file mode 100644 index 000000000..b9ca049d2 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TransactionManager.cs @@ -0,0 +1,362 @@ +using System.Collections; +using System.Collections.Concurrent; +using System.Threading; +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; +using System.Runtime.InteropServices; + +namespace DB { +public class TransactionManager { + private static readonly int MAX_QUEUE_SIZE = 4; + internal BlockingCollection txnQueue; + internal static int pastTnumCircularBufferSize = 1 << 14; + internal TransactionContext[] tnumToCtx = new TransactionContext[pastTnumCircularBufferSize]; // write protected by spinlock, atomic with txnc increment + internal int txnc = 0; + internal int tid = 0; + internal Thread[] committer; + internal SimpleObjectPool ctxPool; + internal List active = new List(); // list of active transaction contexts, protected by spinlock + internal SpinLock sl = new SpinLock(); + private IWriteAheadLog? 
wal; + protected Dictionary tables; + protected ILogger logger; + public TransactionManager(int numThreads, Dictionary tables, IWriteAheadLog? wal = null, ILogger logger = null){ + this.wal = wal; + this.logger = logger; + this.tables = tables; + ctxPool = new SimpleObjectPool(() => new TransactionContext(tables)); + committer = new Thread[numThreads]; + txnQueue = new BlockingCollection(MAX_QUEUE_SIZE); + + for (int i = 0; i < committer.Length; i++) { + committer[i] = new Thread(() => { + try { + while (true) { + TransactionContext ctx = txnQueue.Take(); + ValidateAndWrite(ctx); + } + } catch (ThreadInterruptedException){ + System.Console.WriteLine("Terminated"); + } + }); + } + } + + /// + /// Create a new transaction context + /// + /// Newly created transaction context + public TransactionContext Begin(){ + var ctx = ctxPool.Checkout(); + ctx.Init(startTxn: txnc, NewTransactionId()); + if (wal != null) { + wal.Begin(ctx.tid); + } + return ctx; + } + + /// + /// Submit a transaction context to be committed. 
Blocks until commit is completed + /// + /// Context to commit + /// True if the transaction committed, false otherwise + public bool Commit(TransactionContext ctx){ + PrintDebug($"adding ctx to queue for commit", ctx); + ctx.status = TransactionStatus.Pending; + txnQueue.Add(ctx); + while (!Util.IsTerminalStatus(ctx.status)){ + Thread.Yield(); + } + if (ctx.status == TransactionStatus.Aborted){ + return false; + } else if (ctx.status == TransactionStatus.Committed) { + return true; + } + return false; + } + + public void CommitWithCallback(TransactionContext ctx, Action callback){ + PrintDebug($"adding ctx to queue for commit", ctx); + ctx.status = TransactionStatus.Pending; + ctx.callback = callback; + txnQueue.Add(ctx); + } + + /// + /// Spawns a thread that continuously polls the queue to + /// validate and commit a transaction context + /// + public void Run(){ + for (int i = 0; i < committer.Length; i++) { + committer[i].Start(); + } + } + + public void Reset() { + txnQueue.CompleteAdding(); + txnQueue = new BlockingCollection(MAX_QUEUE_SIZE); + active = new List(); + sl = new SpinLock(); + txnc = 0; + tid = 0; + tnumToCtx = new TransactionContext[pastTnumCircularBufferSize]; + ctxPool = new SimpleObjectPool(() => new TransactionContext(tables)); + } + + public void Terminate(){ + for (int i = 0; i < committer.Length; i++) { + committer[i]?.Interrupt(); + } + if (wal != null) wal.Terminate(); + } + + /// + /// Mutates "active", sets fields to mark active transaction + /// + /// + /// + public bool Validate(TransactionContext ctx){ + PrintDebug($"Validating own keys", ctx); + bool lockTaken = false; // signals if this thread was able to acquire lock + int finishTxn; + List finish_active; + try { + sl.Enter(ref lockTaken); + finishTxn = txnc; + finish_active = new List(active); + active.Add(ctx); + } finally { + if (lockTaken) sl.Exit(); + lockTaken = false; + } + // PrintDebug($"Committing {ctx.startTxn} to {finishTxn}", ctx); + + // validate + for (int i = 
ctx.startTxnNum + 1; i <= finishTxn; i++){ + // Console.WriteLine((i & (pastTnumCircularBufferSize - 1)) + " readset: " + ctx.GetReadset().Count + "; writeset:" + ctx.GetWriteset().Count); + // foreach (var x in tnumToCtx[i % pastTnumCircularBufferSize].GetWriteset()){ + // Console.Write($"{x}, "); + // } + foreach (ref var tupleId in CollectionsMarshal.AsSpan(ctx.GetReadsetKeys())){ + // Console.WriteLine($"scanning for {keyAttr}"); + // TODO: rename keyattr since tupleid is redundant + if (tnumToCtx[i & (pastTnumCircularBufferSize - 1)].InWriteSet(ref tupleId)){ + // Console.WriteLine($"1 ABORT for {ctx.tid} because conflict: {tupleId} in {tnumToCtx[i & (pastTnumCircularBufferSize - 1)].tid}"); + return false; + } + } + } + + foreach (TransactionContext pastTxn in finish_active){ + foreach (var item in pastTxn.GetWritesetKeys()){ + PrimaryKey tupleId = item; + if (ctx.InReadSet(ref tupleId) || ctx.InWriteSet(ref tupleId)){ + // Console.WriteLine($"2 ABORT for {ctx.tid} because conflict: {tupleId} in {pastTxn.tid}"); + return false; + } + } + } + return true; + } + + virtual public void Write(TransactionContext ctx, Action commit){ + PrintDebug("Write phase", ctx); + bool lockTaken = false; // signals if this thread was able to acquire lock + List writesetKeys = ctx.GetWritesetKeys(); + for(int i = 0; i < writesetKeys.Count; i++){ + PrimaryKey tupleId = writesetKeys[i]; + var item = ctx.GetFromWriteset(i); + // TODO: should not throw exception here, but if it does, abort. + // failure here means crashed before commit. 
would need to rollback + tables[tupleId.Table].Write(ref tupleId, item.Item1, item.Item2); + } + // TODO: verify that should be logged before removing from active + if (wal != null){ + commit(ctx.tid, LogType.Commit); + // wal.Finish(new LogEntry(prevLsn, ctx.tid, LogType.Commit)); + } + ctx.callback?.Invoke(true); + // assign num + int finalTxnNum; + try { + sl.Enter(ref lockTaken); + txnc += 1; // TODO: deal with int overflow + finalTxnNum = txnc; + active.Remove(ctx); + if (tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)] != null){ + ctxPool.Return(tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)]); + } + tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)] = ctx; + } finally { + if (lockTaken) sl.Exit(); + lockTaken = false; + } + // PrintDebug("Write phase done", ctx); + } + + public void Abort(TransactionContext ctx, Action callback = null){ + PrintDebug($"Aborting tid {ctx.tid}"); + bool lockTaken = false; // signals if this thread was able to acquire lock + // TODO: verify that should be logged before removing from active + if (wal != null){ + wal.Finish(ctx.tid, LogType.Abort); + } + if (ctx.callback == null){ + ctx.callback = callback; + } + ctx.callback?.Invoke(false); + try { + sl.Enter(ref lockTaken); + active.Remove(ctx); + } finally { + if (lockTaken) sl.Exit(); + lockTaken = false; + } + } + + virtual protected void ValidateAndWrite(TransactionContext ctx) { + bool valid = Validate(ctx); + + if (valid) { + ctx.status = TransactionStatus.Validated; + Write(ctx, (tid, type) => wal.Finish(tid, type)); + ctx.status = TransactionStatus.Committed; + } else { + Abort(ctx); + ctx.status = TransactionStatus.Aborted; + } + } + + virtual public void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[TM TID {(ctx != null ? 
ctx.tid : -1)}]: {msg}"); + } + + private long NewTransactionId(){ + return Interlocked.Increment(ref tid); + } +} + +public class ShardedTransactionManager : TransactionManager { + private RpcClient rpcClient; + private IWriteAheadLog? wal; + private ConcurrentDictionary> txnIdToOKDarqLsns = new ConcurrentDictionary>(); // tid to num shards waiting on + public ShardedTransactionManager(int numThreads, RpcClient rpcClient, Dictionary tables, IWriteAheadLog? wal = null, ILogger logger = null) : base(numThreads, tables.ToDictionary(kv => kv.Key, kv => (Table)kv.Value), wal, logger){ + this.rpcClient = rpcClient; + this.wal = wal; + } + + public void MarkAcked(long tid, TransactionStatus status, long darqLsn, long shard){ + TransactionContext? ctx = active.Find(ctx => ctx.tid == tid); + if (ctx == null) return; // already aborted, ignore + if (status == TransactionStatus.Aborted){ + Abort(ctx); + ctx.status = TransactionStatus.Aborted; + // TODO: should consume all existing OKs + return; + } else if (status != TransactionStatus.Validated){ + throw new Exception($"Invalid status {status} for tid {tid}"); + } + + txnIdToOKDarqLsns[tid].Add((darqLsn, shard)); + wal.RecordOk(tid, shard); + + PrintDebug($"Marked acked", ctx); + + if (txnIdToOKDarqLsns[tid].Count == rpcClient.GetNumServers() - 1){ + PrintDebug($"done w validation", ctx); + ctx.status = TransactionStatus.Validated; + Write(ctx, (tid, type) => wal.Finish2pc(tid, type, txnIdToOKDarqLsns[tid])); + ctx.status = TransactionStatus.Committed; + } + } + + override protected void ValidateAndWrite(TransactionContext ctx){ + ctx.status = TransactionStatus.Pending; + // validate own, need a map to track which has responded + bool valid = Validate(ctx); + + if (valid) { + // split writeset into shards + Dictionary> shardToWriteset = new Dictionary>(); + List writesetKeys = ctx.GetWritesetKeys(); + for (int i = 0; i < writesetKeys.Count; i++){ + PrimaryKey tupleId = writesetKeys[i]; + (TupleDesc[] td, byte[] val) = 
ctx.GetFromWriteset(i); + long shardDest = rpcClient.HashKeyToDarqId(tupleId); + if (!rpcClient.IsLocalKey(tupleId)){ + if (!shardToWriteset.ContainsKey(shardDest)){ + shardToWriteset[shardDest] = new List<(PrimaryKey, TupleDesc[], byte[])>(); + } + shardToWriteset[shardDest].Add((tupleId, td, val)); + } + } + if (shardToWriteset.Count > 0) { + + if (txnIdToOKDarqLsns.ContainsKey(ctx.tid)) throw new Exception($"Ctx TID {ctx.tid} already started validating?"); + PrintDebug($"Created waiting for OK list", ctx); + txnIdToOKDarqLsns[ctx.tid] = new List<(long, long)>(); + for (int shard = 0; shard < rpcClient.GetNumServers(); shard++){ + if (shard == rpcClient.GetId() || shardToWriteset.ContainsKey(shard)) continue; + txnIdToOKDarqLsns[ctx.tid].Add((-1, shard)); // hacky way to indicate that we don't need to wait for this shard + } + // send out prepare messages and wait; the commit is finished by calls to MarkAcked + wal.Prepare(shardToWriteset, ctx.tid); + } else { + PrintDebug($"Commit on local, no waiting needed", ctx); + Write(ctx, (tid, type) => wal.Finish(tid, type)); + ctx.status = TransactionStatus.Committed; + } + } else { + Abort(ctx); + ctx.status = TransactionStatus.Aborted; + } + + } + + override public void Write(TransactionContext ctx, Action commit){ + PrintDebug("Write phase", ctx); + bool lockTaken = false; // signals if this thread was able to acquire lock + List writesetKeys = ctx.GetWritesetKeys(); + for(int i = 0; i < writesetKeys.Count; i++){ + PrimaryKey tupleId = writesetKeys[i]; + if (!rpcClient.IsLocalKey(tupleId)) continue; + var item = ctx.GetFromWriteset(i); + // TODO: should not throw exception here, but if it does, abort. + // failure here means crashed before commit. 
would need to rollback + tables[tupleId.Table].Write(ref tupleId, item.Item1, item.Item2); + } + // TODO: verify that should be logged before removing from active + if (wal != null){ + commit(ctx.tid, LogType.Commit); + // wal.Finish(new LogEntry(prevLsn, ctx.tid, LogType.Commit)); + } + ctx.callback?.Invoke(true); + // assign num + int finalTxnNum; + try { + sl.Enter(ref lockTaken); + txnc += 1; // TODO: deal with int overflow + finalTxnNum = txnc; + active.Remove(ctx); + if (tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)] != null){ + ctxPool.Return(tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)]); + } + tnumToCtx[finalTxnNum & (pastTnumCircularBufferSize - 1)] = ctx; + } finally { + if (lockTaken) sl.Exit(); + lockTaken = false; + } + // PrintDebug("Write phase done", ctx); + } + + public RpcClient GetRpcClient(){ + return rpcClient; + } + + void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[STM {rpcClient.GetId()} TID {(ctx != null ? 
ctx.tid : -1)}]: {msg}"); + } + +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/TransactionProcessorService.cs b/cs/research/darq/DistributedTransactions/TransactionProcessorService.cs new file mode 100644 index 000000000..19cab0974 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/TransactionProcessorService.cs @@ -0,0 +1,433 @@ +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; +using Google.Protobuf; +using Grpc.Core; +using FASTER.client; +using System.Diagnostics; +using Grpc.Net.Client; +using darq; +using darq.client; +using System.Collections.Concurrent; + +namespace DB { + +public class DarqTransactionProcessorService : TransactionProcessor.TransactionProcessorBase, IDarqProcessor { + private ShardedTransactionManager txnManager; + private ConcurrentDictionary<(long, long), long> externalToInternalTxnId = new ConcurrentDictionary<(long, long), long>(); + private ConcurrentDictionary txnIdToTxnCtx = new ConcurrentDictionary(); + private DarqWal wal; + private long partitionId; + // from darqProcessor + private Darq backend; + private readonly DarqBackgroundTask _backgroundTask; + private readonly DarqBackgroundWorkerPool workerPool; + private readonly ManualResetEventSlim terminationStart, terminationComplete; + private Thread refreshThread, processingThread; + private ColocatedDarqProcessorClient processorClient; + + private IDarqProcessorClientCapabilities capabilities; + + private SimpleObjectPool stepRequestPool = new(() => new StepRequest()); + private int nextWorker = 0; + private StepRequest reusableRequest = new(); + Dictionary tables; + Dictionary clusterMap; + private TpccBenchmark tpccBenchmark; + protected ILogger logger; + public DarqTransactionProcessorService( + long partitionId, + Dictionary tables, + ShardedTransactionManager txnManager, + DarqWal wal, + Darq darq, + DarqBackgroundWorkerPool workerPool, + Dictionary clusterMap, + ILogger logger = null + ) { + this.tables = 
tables; + this.logger = logger; + this.txnManager = txnManager; + this.partitionId = partitionId; + this.wal = wal; + this.clusterMap = clusterMap; + + int PartitionsPerThread = 2; + int ThreadCount = 2; + int MachineCount = 2; + + BenchmarkConfig ycsbCfg = new BenchmarkConfig( + ratio: 0.2, + attrCount: 10, + threadCount: ThreadCount, + insertThreadCount: 12, + iterationCount: 1, + nCommitterThreads: 5 + // perThreadDataCount: 1000000 + ); + TpccConfig tpccConfig = new TpccConfig( + numWh: PartitionsPerThread * ThreadCount * MachineCount, + partitionsPerThread: PartitionsPerThread + // newOrderCrossPartitionProbability: 0, + // paymentCrossPartitionProbability: 0 + // numCustomer: 10, + // numDistrict: 10, + // numItem: 10, + // numOrder: 10, + // numStock: 10 + ); + + tpccBenchmark = new TpccBenchmark((int)partitionId, tpccConfig, ycsbCfg, tables, txnManager); + + backend = darq; + _backgroundTask = new DarqBackgroundTask(backend, workerPool, session => new TransactionProcessorProducerWrapper(clusterMap, session)); + terminationStart = new ManualResetEventSlim(); + terminationComplete = new ManualResetEventSlim(); + this.workerPool = workerPool; + backend.ConnectToCluster(out _); + + + + _backgroundTask.BeginProcessing(); + + refreshThread = new Thread(() => + { + while (!terminationStart.IsSet) + backend.Refresh(); + terminationComplete.Set(); + }); + refreshThread.Start(); + + processorClient = new ColocatedDarqProcessorClient(backend); + processingThread = new Thread(() => + { + processorClient.StartProcessing(this); + }); + processingThread.Start(); + // TODO(Tianyu): Hacky + // spin until we are sure that we have started + while (capabilities == null) {} + } + public override Task Read(ReadRequest request, ServerCallContext context) + { + long internalTid = GetOrRegisterTid(request.PartitionId, request.Tid); + Table table = tables[request.Key.Table]; + TransactionContext ctx = txnIdToTxnCtx[internalTid]; + PrimaryKey tupleId = new 
PrimaryKey(request.Key.Table, request.Key.Keys.ToArray()[0], request.Key.Keys.ToArray()[1], request.Key.Keys.ToArray()[2], request.Key.Keys.ToArray()[3], request.Key.Keys.ToArray()[4], request.Key.Keys.ToArray()[5]); + TupleDesc[] tupleDescs = table.GetSchema(); + ReadReply reply = new ReadReply{ Value = ByteString.CopyFrom(table.Read(tupleId, tupleDescs, ctx))}; + return Task.FromResult(reply); + } + + public override Task ReadSecondary(ReadSecondaryRequest request, ServerCallContext context) + { + PrintDebug($"Reading secondary from rpc service"); + long internalTid = GetOrRegisterTid(request.PartitionId, request.Tid); + Table table = tables[request.Table]; + TransactionContext ctx = txnIdToTxnCtx[internalTid]; + var (value, pk) = table.ReadSecondary(request.Key.ToArray(), table.GetSchema(), ctx); + ReadSecondaryReply reply = new ReadSecondaryReply{ Value = ByteString.CopyFrom(value), Key = new PbPrimaryKey{ Keys = {pk.Key1, pk.Key2, pk.Key3, pk.Key4, pk.Key5, pk.Key6}, Table = pk.Table}}; + return Task.FromResult(reply); + } + + public override Task PopulateTables(PopulateTablesRequest request, ServerCallContext context) + { + PrintDebug($"Populating tables from rpc service"); + BenchmarkConfig cfg = new BenchmarkConfig( + seed: request.Seed, + ratio: request.Ratio, + threadCount: request.ThreadCount, + attrCount: request.AttrCount, + perThreadDataCount: request.PerThreadDataCount, + iterationCount: request.IterationCount, + perTransactionCount: request.PerTransactionCount, + nCommitterThreads: request.NCommitterThreads + ); + + TpccConfig tpccCfg = new TpccConfig( + numWh: request.NumWh, + numDistrict: request.NumDistrict, + numCustomer: request.NumCustomer, + numItem: request.NumItem, + numOrder: request.NumOrder, + numStock: request.NumStock, + newOrderCrossPartitionProbability: request.NewOrderCrossPartitionProbability, + paymentCrossPartitionProbability: request.PaymentCrossPartitionProbability, + partitionsPerThread: request.PartitionsPerThread + ); + 
TpccBenchmark tpccBenchmark = new TpccBenchmark((int)partitionId, tpccCfg, cfg, tables, txnManager); + txnManager.Run(); + tpccBenchmark.PopulateTables(); + txnManager.Terminate(); + PopulateTablesReply reply = new PopulateTablesReply{ Success = true}; + return Task.FromResult(reply); + } + + public override Task EnqueueWorkload(EnqueueWorkloadRequest request, ServerCallContext context) + { + switch (request.Workload) { + case "ycsb_single": + // only uses single table + // TableBenchmark ycsb_single = new FixedLenTableBenchmark("ycsb_local", ycsbCfg, wal); + // ycsb_single.RunTransactions(); + break; + case "ycsb": + // only uses single table + // TableBenchmark b = new ShardedBenchmark("2pc", ycsbCfg, txnManager, tables[0], wal); + // b.RunTransactions(); + break; + case "tpcc": + tpccBenchmark.RunTransactions(); + // tpccBenchmark.GenerateTables(); + break; + case "tpcc-populate": + tpccBenchmark.PopulateTables(); + break; + default: + throw new NotImplementedException(); + } + + + // Table table = tables[0]; + // txnManager.Run(); + // var ctx = txnManager.Begin(); + // Console.WriteLine("Should go to own"); + // var own = table.Read(new PrimaryKey(table.GetId(), 0), new TupleDesc[]{new TupleDesc(12345, 8, 0)}, ctx); + // Console.WriteLine(own.ToString()); + // foreach (var b in own.ToArray()){ + // Console.WriteLine(b); + // } + // Console.WriteLine("Should RPC:"); + // var other = table.Read(new PrimaryKey(table.GetId(), 1), new TupleDesc[]{new TupleDesc(12345, 8, 0)}, ctx); + // Console.WriteLine(other.ToString()); + // foreach (var b in other.ToArray()){ + // Console.WriteLine(b); + // } + // Console.WriteLine("Starting commit"); + // txnManager.Commit(ctx); + // txnManager.Terminate(); + EnqueueWorkloadReply enqueueWorkloadReply = new EnqueueWorkloadReply{Success = true}; + return Task.FromResult(enqueueWorkloadReply); + } + + // typically used for Prepare() and Commit() + public override async Task WriteWalEntry(WalRequest request, ServerCallContext 
context) + { + // PrintDebug($"Writing to WAL from {request.PartitionId}"); + LogEntry entry = LogEntry.FromBytes(request.Message.ToArray()); + + if (entry.type == LogType.Prepare || entry.type == LogType.Commit) + { + long internalTid = GetOrRegisterTid(request.PartitionId, request.Tid); + entry.lsn = internalTid; // TODO: HACKY reuse, we keep tid to be original tid + entry.prevLsn = request.PartitionId; // TODO: hacky place to put sender id + PrintDebug($"Stepping prepare/commit {entry.lsn}"); + } else { + PrintDebug($"Stepping ok/ack {entry.tid}"); + } + + var stepRequest = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(stepRequest); + // TODO: do we need to step messages consumed, self, and out messages + requestBuilder.AddSelfMessage(entry.ToBytes()); + await capabilities.Step(requestBuilder.FinishStep()); + stepRequestPool.Return(stepRequest); + backend.EndAction(); + return new WalReply{Success = true}; + } + + private long GetOrRegisterTid(long partitionId, long tid) { + PrintDebug($"Getting or registering tid: ({partitionId}, {tid})"); + if (externalToInternalTxnId.ContainsKey((partitionId, tid))) + return externalToInternalTxnId[(partitionId, tid)]; + + var ctx = txnManager.Begin(); + long internalTid = ctx.tid; + PrintDebug("Registering new tid: " + internalTid); + externalToInternalTxnId[(partitionId, tid)] = internalTid; + txnIdToTxnCtx[internalTid] = ctx; + return internalTid; + } + + public void Dispose(){ + foreach (var table in tables.Values) { + table.Dispose(); + } + txnManager.Terminate(); + terminationStart.Set(); + // TODO(Tianyu): this shutdown process is unsafe and may leave things unsent/unprocessed in the queue + backend.ForceCheckpoint(); + Thread.Sleep(1000); + _backgroundTask.StopProcessing(); + _backgroundTask.Dispose(); + processorClient.StopProcessingAsync().GetAwaiter().GetResult(); + processorClient.Dispose(); + terminationComplete.Wait(); + refreshThread.Join(); + processingThread.Join(); + } + + public 
Darq GetBackend() => backend; + + public bool ProcessMessage(DarqMessage m){ + PrintDebug($"Processing message"); + bool recoveryMode = false; + switch (m.GetMessageType()){ + case DarqMessageType.IN: + { + unsafe + { + fixed (byte* b = m.GetMessageBody()) + { + int signal = *(int*)b; + // This is a special termination signal + if (signal == -1) + { + m.Dispose(); + // Return false to signal that there are no more messages to process and the processing + // loop can exit + return false; + } + } + } + + LogEntry entry = LogEntry.FromBytes(m.GetMessageBody().ToArray()); + var requestBuilder = new StepRequestBuilder(reusableRequest); + // requestBuilder.AddRecoveryMessage(m.GetMessageBody()); + switch (entry.type) + { + // Coordinator side + case LogType.Ok: + { + PrintDebug($"Got OK log entry: {entry}"); + txnManager.MarkAcked(entry.tid, TransactionStatus.Validated, m.GetLsn(), entry.prevLsn); + m.Dispose(); + return true; + } + case LogType.Ack: + { + PrintDebug($"Got ACK log entry: {entry}"); + // can ignore in DARQ since we know out commit message is sent + m.Dispose(); + return true; + } + // Worker side + case LogType.Prepare: + { + PrintDebug($"Got prepare log entry: {entry}"); + requestBuilder.MarkMessageConsumed(m.GetLsn()); + long sender = entry.prevLsn; // hacky + long internalTid = entry.lsn; // "" + + // add each write to context before validating + TransactionContext ctx = txnIdToTxnCtx[internalTid]; + for (int i = 0; i < entry.pks.Length; i++) + { + PrimaryKey pk = entry.pks[i]; + ctx.AddWriteSet(ref pk, entry.tupleDescs[i], entry.vals[i]); + } + bool success = txnManager.Validate(ctx); + PrintDebug($"Validated at node {partitionId}: {success}; now sending OK to {sender}"); + if (success) { + LogEntry okEntry = new LogEntry(partitionId, entry.tid, LogType.Ok); + requestBuilder.AddOutMessage(new DarqId(sender), okEntry.ToBytes()); + } + break; + } + case LogType.Commit: + { + PrintDebug($"Got commit log entry: {entry}"); + 
requestBuilder.MarkMessageConsumed(m.GetLsn()); + long sender = entry.prevLsn; // hacky + long internalTid = entry.lsn; // "" + + txnManager.Write(txnIdToTxnCtx[internalTid], (tid, type) => wal.Finish(tid, type)); + + PrintDebug($"Committed at node {partitionId}; now sending ACK to {sender}"); + LogEntry ackEntry = new LogEntry(partitionId, entry.tid, LogType.Ack); + requestBuilder.AddOutMessage(new DarqId(sender), ackEntry.ToBytes()); + break; + } + default: + throw new NotImplementedException(); + } + + + m.Dispose(); + var v = capabilities.Step(requestBuilder.FinishStep()); + Debug.Assert(v.GetAwaiter().GetResult() == StepStatus.SUCCESS); + return true; + } + case DarqMessageType.RECOVERY: // this is on recovery; TODO: do we need to double pass? + PrintDebug($"Recovering?, got log"); + if (recoveryMode) { + LogEntry entry = LogEntry.FromBytes(m.GetMessageBody().ToArray()); + + PrintDebug($"Recovering, got log entry: {entry}"); + + } + m.Dispose(); + return true; + default: + throw new NotImplementedException(); + } + } + + public void OnRestart(IDarqProcessorClientCapabilities capabilities) { + this.capabilities = capabilities; + this.wal.SetCapabilities(capabilities); + } + + void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[TPS {partitionId} TID {(ctx != null ? 
ctx.tid : -1)}]: {msg}"); + } +} + + +public class TransactionProcessorProducerWrapper : IDarqProducer +{ + private Dictionary clusterMap; + private ConcurrentDictionary clients = new(); + private DprSession session; + + public TransactionProcessorProducerWrapper(Dictionary clusterMap, DprSession session) + { + this.clusterMap = clusterMap; + this.session = session; + } + + public void Dispose() {} + + public void EnqueueMessageWithCallback(DarqId darqId, ReadOnlySpan message, Action callback, long producerId, long lsn) + { + LogEntry entry = LogEntry.FromBytes(message.ToArray()); + var client = clients.GetOrAdd(darqId, + _ => new TransactionProcessor.TransactionProcessorClient(clusterMap[darqId])); + var walRequest = new WalRequest + { + Message = ByteString.CopyFrom(message), + Tid = entry.tid, + PartitionId = producerId, + Lsn = lsn, + }; + Task.Run(async () => + { + try + { + await client.WriteWalEntryAsync(walRequest); + callback(true); + } + catch + { + callback(false); + throw; + } + }); + } + + public void ForceFlush() + { + // TODO(Tianyu): Not implemented for now + } +} +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/Util.cs b/cs/research/darq/DistributedTransactions/Util.cs new file mode 100644 index 000000000..52ed37737 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/Util.cs @@ -0,0 +1,492 @@ +using System; +using System.Drawing; +using System.Runtime.InteropServices; +using SharpNeat.Utility; + + +namespace DB { + + public enum TransactionStatus { + Idle, + Pending, + Validated, + Committed, + Aborted + } + + public enum OperationType { + Read, + Insert, + Update + } + + public unsafe struct Pointer { + public Pointer(IntPtr ptr, int size){ + Size = size; + IntPointer = ptr; + Ptr = ptr.ToPointer(); + } + public Pointer(void* ptr, int size){ + Size = size; + IntPointer = new IntPtr(ptr); + Ptr = ptr; + } + public int Size; + public IntPtr IntPointer; + public void* Ptr; + } + + // public struct 
Operation { + // public Operation(OperationType type, TupleId tupleId, TupleDesc[] tupleDescs, ReadOnlySpan val){ + // if (type != OperationType.Read && Util.IsEmpty(val)) { + // throw new ArgumentException("Writes must provide a non-null value"); + // } + // Type = type; + // TupleID = tupleId; + // TupleDescs = tupleDescs; + // Value = val.ToArray(); + // } + // public OperationType Type; + // public TupleId TupleID; + // public TupleDesc[] TupleDescs; + // public byte[] Value; + + + // public byte[] ToBytes(){ + // using (MemoryStream m = new MemoryStream()) { + // using (BinaryWriter writer = new BinaryWriter(m)) { + // writer.Write(BitConverter.GetBytes((int)Type)); + // writer.Write(TupleID.Key); + // writer.Write(TupleID.TableHash); + // writer.Write(TupleDescs.Count()); + // foreach (TupleDesc desc in TupleDescs){ + // writer.Write(desc.Attr); + // writer.Write(desc.Size); + // } + // writer.Write(Value); + // } + // return m.ToArray(); + // } + // } + + // public static Operation FromBytes(byte[] data) { + // Operation op = new Operation(); + // using (MemoryStream m = new MemoryStream(data)) { + // using (BinaryReader reader = new BinaryReader(m)) { + // op.Type = (OperationType)reader.ReadInt32(); + + // long key = reader.ReadInt64(); + // int tableHash = reader.ReadInt32(); + // op.TupleID = new TupleId(key, tableHash); + + // int tupleDescLength = reader.ReadInt32(); + // TupleDesc[] descs = new TupleDesc[tupleDescLength]; + // for (int i = 0; i < tupleDescLength; i++){ + // TupleDesc desc = new TupleDesc(); + // desc.Attr = reader.ReadInt64(); + // desc.Size = reader.ReadInt32(); + // descs[i] = desc; + // } + // } + // } + // return op; + // } + + // } + + public readonly struct PrimaryKey{ //} : IEquatable{ + public readonly long Key1 = 0; + public readonly long Key2 = 0; + public readonly long Key3 = 0; + public readonly long Key4 = 0; + public readonly long Key5 = 0; + public readonly long Key6 = 0; + public readonly int Table; + + // public 
PrimaryKey (int t, params long[] keys){ + // Table = t; + // for (int i = 0; i < keys.Length; i++){ + // switch (i){ + // case 0: + // Key1 = keys[i]; + // break; + // case 1: + // Key2 = keys[i]; + // break; + // case 2: + // Key3 = keys[i]; + // break; + // case 3: + // Key4 = keys[i]; + // break; + // case 4: + // Key5 = keys[i]; + // break; + // case 5: + // Key6 = keys[i]; + // break; + // default: + // throw new ArgumentException("Too many keys"); + // } + // } + // } + public PrimaryKey(int t, long k1, long k2, long k3, long k4, long k5, long k6){ + Table = t; + Key1 = k1; + Key2 = k2; + Key3 = k3; + Key4 = k4; + Key5 = k5; + Key6 = k6; + } + + public PrimaryKey(int t, long k1, long k2, long k3, long k4, long k5){ + Table = t; + Key1 = k1; + Key2 = k2; + Key3 = k3; + Key4 = k4; + Key5 = k5; + } + + public PrimaryKey(int t, long k1, long k2, long k3, long k4){ + Table = t; + Key1 = k1; + Key2 = k2; + Key3 = k3; + Key4 = k4; + } + + public PrimaryKey(int t, long k1, long k2, long k3){ + Table = t; + Key1 = k1; + Key2 = k2; + Key3 = k3; + } + + public PrimaryKey(int t, long k1, long k2){ + Table = t; + Key1 = k1; + Key2 = k2; + } + + public PrimaryKey(int t, long k1){ + Table = t; + Key1 = k1; + } + + public override bool Equals(object o){ + if (o == null || GetType() != o.GetType()){ + return false; + } + PrimaryKey other = (PrimaryKey)o; + if (Table != other.Table) return false; + if (Key1 != other.Key1) return false; + if (Key2 != other.Key2) return false; + if (Key3 != other.Key3) return false; + if (Key4 != other.Key4) return false; + if (Key5 != other.Key5) return false; + if (Key6 != other.Key6) return false; + return true; + } + + public override int GetHashCode(){ + int hash = 17; + hash = hash * 31 + Key1.GetHashCode(); + hash = hash * 31 + Key2.GetHashCode(); + hash = hash * 31 + Key3.GetHashCode(); + hash = hash * 31 + Key4.GetHashCode(); + hash = hash * 31 + Key5.GetHashCode(); + hash = hash * 31 + Key6.GetHashCode(); + return hash * 31 + Table; + 
} + + public static int SizeOf = sizeof(long) * 6 + sizeof(int); + + // public override bool Equals(object o){ + // if (o == null || GetType() != o.GetType()){ + // return false; + // } + // PrimaryKey other = (PrimaryKey)o; + // if (Table != other.Table) return false; + // if (Key1 != other.Key1) return false; + // if (Key2 != other.Key2) return false; + // if (Key3 != other.Key3) return false; + // if (Key4 != other.Key4) return false; + // if (Key5 != other.Key5) return false; + // if (Key6 != other.Key6) return false; + // return true; + // } + + // public override int GetHashCode(){ + // int hash = 17; + // foreach (long l in Keys){ + // hash = hash * 31 + l.GetHashCode(); + // } + // return hash * 31 + Table; + // } + + public override string ToString(){ + return $"PK ({Key1}, {Key2}, {Key3}, {Key4}, {Key5}, {Key6}) Table {Table}"; + } + + public unsafe byte[] ToBytes(){ + byte[] arr = new byte[SizeOf]; + + fixed (byte* b = arr) { + var head = b; + *(long*)head = Key1; + head += sizeof(long); + *(long*)head = Key2; + head += sizeof(long); + *(long*)head = Key3; + head += sizeof(long); + *(long*)head = Key4; + head += sizeof(long); + *(long*)head = Key5; + head += sizeof(long); + *(long*)head = Key6; + head += sizeof(long); + *(int*)head = Table; + } + return arr; + } + + public static unsafe PrimaryKey FromBytes(byte[] data){ + PrimaryKey result; + fixed (byte* b = data) { + var head = b; + long k1 = *(long*)head; + head += sizeof(long); + long k2 = *(long*)head; + head += sizeof(long); + long k3 = *(long*)head; + head += sizeof(long); + long k4 = *(long*)head; + head += sizeof(long); + long k5 = *(long*)head; + head += sizeof(long); + long k6 = *(long*)head; + head += sizeof(long); + int table = *(int*)head; + result = new PrimaryKey(table, k1, k2, k3, k4, k5, k6); + // long[] keys = new long[6]; + // for (int i = 0; i < keys.Length; i++){ + // keys[i] = *(long*)head; + // head += sizeof(long); + // } + // result = new PrimaryKey(*(int*)head, keys); + } + 
return result; + } + + // public bool Equals(TupleId o){ + // return Key == o.Key && Attr == o.Attr && Table == o.Table; + // } + + // public override bool Equals([NotNullWhen(true)] object o) + // { + // if (o == null || GetType() != o.GetType()) + // { + // return false; + // } + // return Equals((TupleId)o); + // } + + // public override int GetHashCode(){ + // return (int)Key + (int)Attr + Table.GetHashCode(); + // } + + } + + public struct TupleDesc { + public TupleDesc(long attr, int size, int offset){ + Attr = attr; + Size = size; + Offset = offset; + } + public long Attr; + public int Size; + public int Offset; + public static int SizeOf = sizeof(long) + sizeof(int) + sizeof(int); + + public override string ToString(){ + return $"(Attr:{Attr}, Size:{Size}, Offset:{Offset})"; + } + + public override bool Equals(object o){ + if (o == null || GetType() != o.GetType()){ + return false; + } + TupleDesc other = (TupleDesc)o; + return Attr == other.Attr && Size == other.Size && Offset == other.Offset; + } + + public override int GetHashCode(){ + return Attr.GetHashCode() + Size.GetHashCode() + Offset.GetHashCode(); + } + + public unsafe byte[] ToBytes(){ + byte[] arr = new byte[SizeOf]; + fixed (byte* b = arr) { + var head = b; + *(long*)head = Attr; + head += sizeof(long); + *(int*)head = Size; + head += sizeof(int); + *(int*)head = Offset; + } + return arr; + } + + public static unsafe TupleDesc FromBytes(byte[] data){ + TupleDesc result; + fixed (byte* b = data) { + var head = b; + long attr = *(long*)head; + head += sizeof(long); + int size = *(int*)head; + head += sizeof(int); + int offset = *(int*)head; + result = new TupleDesc(attr, size, offset); + } + return result; + } + } + + // public struct KeyAttr { + // public KeyAttr(PrimaryKey key, long attr){ + // Key = key; + // Attr = attr; + // } + // public PrimaryKey Key; + // public long Attr; + // public int Size => Key.Size + sizeof(long); + + // public override string ToString(){ + // return $"KA ({Key}, 
{Attr})"; + // } + + // public override bool Equals(object o){ + // if (o == null || GetType() != o.GetType()){ + // return false; + // } + // KeyAttr other = (KeyAttr)o; + // return Key.Equals(other.Key) && Attr == other.Attr; + // } + + // public override int GetHashCode(){ + // return Key.GetHashCode() + Attr.GetHashCode(); + // } + + // public unsafe byte[] ToBytes(){ + // byte[] arr = new byte[Size]; + // fixed (byte* b = arr) { + // var head = b; + // Key.ToBytes().CopyTo(new Span(head, Key.Size)); + // head += Key.Size; + // *(long*)head = Attr; + // } + // return arr; + // } + + // public static unsafe KeyAttr FromBytes(byte[] data) { + // KeyAttr result = new KeyAttr(); + + // fixed (byte* b = data) { + // var head = b; + // result.Key = PrimaryKey.FromBytes(new Span(head, data.Length - sizeof(long)).ToArray()); + // head += result.Key.Size; + // result.Attr = *(long*)head; + // } + // return result; + // } + // } + + // public class OCCComparer : IEqualityComparer + // { + // public bool Equals(KeyAttr x, KeyAttr y) + // { + // return x.Key.Equals(y.Key); + // } + + // public int GetHashCode(KeyAttr obj) + // { + // return obj.GetHashCode(); + // } + // } + + public class ByteArrayComparer : IEqualityComparer { + public bool Equals(byte[] left, byte[] right) { + if ( left == null || right == null ) { + return left == right; + } + return left.SequenceEqual(right); + } + public int GetHashCode(byte[] key) { + if (key == null) + throw new ArgumentNullException("key"); + return key.Sum(b => b); + } + } + + public class Util { + public static bool IsEmpty(ReadOnlySpan val){ + if (val.IsEmpty) { + return true; + } + foreach (byte b in val) + { + if (b != 0) + { + return false; // If any element is not 0, return false + } + } + return true; // All elements are 0 + } + + public static bool IsTerminalStatus(TransactionStatus status){ + return status == TransactionStatus.Committed || status == TransactionStatus.Aborted; + } + + public static int GetLength(byte[][] 
arr){ + int len = 0; + foreach (byte[] a in arr){ + len += a.Length; + } + return len; + } + + public static int CompareArrays(IEnumerable first, IEnumerable second) where T : IComparable + { + using (var firstEnum = first.GetEnumerator()) + using (var secondEnum = second.GetEnumerator()) + { + while (firstEnum.MoveNext()) + { + if (!secondEnum.MoveNext()) + return 1; + + int cmp = firstEnum.Current.CompareTo(secondEnum.Current); + if (cmp != 0) + return cmp; + } + + return secondEnum.MoveNext() ? -1 : 0; + } + } + + public static void Shuffle (FastRandom rng, T[] array) + { + int n = array.Length; + while (n > 1) + { + int k = rng.Next(n--); + T temp = array[n]; + array[n] = array[k]; + array[k] = temp; + } + } + } + +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/WriteAheadLog.cs b/cs/research/darq/DistributedTransactions/WriteAheadLog.cs new file mode 100644 index 000000000..a2417bba5 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/WriteAheadLog.cs @@ -0,0 +1,186 @@ +using FASTER.darq; +using FASTER.libdpr; +using System.Diagnostics; +using FASTER.common; +using System.Collections.Concurrent; + + +namespace DB +{ +public interface IWriteAheadLog +{ + public long RecordOk(long tid, long shard); + public long Begin(long tid); + public long Write(long tid, ref PrimaryKey pk, TupleDesc[] tupleDescs, byte[] val); + + public long Finish(long tid, LogType type); + public long Prepare(Dictionary> shardToWriteset, long tid); + public long Finish2pc(long tid, LogType type, List<(long, long)> okLsnsToConsume); + // public void SetCapabilities(IDarqProcessorClientCapabilities capabilities); + public void Terminate(); + // public void Recover(); + +} + +public class DarqWal : IWriteAheadLog { + + protected long currLsn = 0; + protected IDarqProcessorClientCapabilities capabilities; + protected DarqId partitionId; + protected SimpleObjectPool requestPool; + internal ConcurrentDictionary txnTbl = new ConcurrentDictionary(); // 
ongoing transactions mapped to most recent lsn + // requestBuilders should last for a Begin,Write,Finish cycle or TODO and should NEVER overlap + protected ConcurrentDictionary requestBuilders = new ConcurrentDictionary(); + protected ILogger logger; + public DarqWal(DarqId partitionId, ILogger logger = null){ + this.partitionId = partitionId; + this.logger = logger; + requestPool = new SimpleObjectPool(() => new StepRequest()); + } + + public long Begin(long tid){ + // use GetOrAdd because may be in the middle of requestBuilder cycle + StepRequestBuilder requestBuilder = requestBuilders.GetOrAdd(tid, _ => new StepRequestBuilder(requestPool.Checkout())); + + LogEntry entry = new LogEntry(GetNewLsn(), tid, LogType.Begin); + entry.lsn = entry.prevLsn; + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + txnTbl[tid] = entry.lsn; + + return entry.lsn; + } + + public long Write(long tid, ref PrimaryKey pk, TupleDesc[] tupleDescs, byte[] val){ + LogEntry entry = new LogEntry(txnTbl[tid], tid, pk, tupleDescs, val); + StepRequestBuilder requestBuilder = requestBuilders[entry.tid]; // throw error if doesn't exist + entry.lsn = GetNewLsn(); + + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + txnTbl[tid] = entry.lsn; + return entry.lsn; + } + + // TODO: add field for shard in log entry + public long RecordOk(long tid, long shard){ + // OK message is stepped by itself + StepRequestBuilder requestBuilder = new StepRequestBuilder(requestPool.Checkout()); + + LogEntry entry = new LogEntry(txnTbl[tid], tid, LogType.Ok); + entry.lsn = GetNewLsn(); + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + txnTbl[tid] = entry.lsn; + + StepAndReturnRequestBuilder(requestBuilder); + return entry.lsn; + } + + /// + /// Commits or aborts a transaction + /// + /// + /// lsn of finish log + public long Finish(long tid, LogType type){ + // on abort, requestBuilder may not have been created + StepRequestBuilder requestBuilder = requestBuilders.GetOrAdd(tid, _ => new 
StepRequestBuilder(requestPool.Checkout())); + + long lsn = GetNewLsn(); + LogEntry entry = new LogEntry(txnTbl.GetValueOrDefault(tid, lsn), tid, type); + entry.lsn = lsn; + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + + StepAndReturnRequestBuilder(requestBuilder); + requestBuilders.Remove(entry.tid, out _); + txnTbl.TryRemove(tid, out _); + return entry.lsn; + } + + /// + /// Writes prepare log and sends out prepare messages to shards + /// + /// + /// + /// lsn of prepare log + public long Prepare(Dictionary> shardToWriteset, long tid) { + StepRequestBuilder requestBuilder = requestBuilders[tid]; // throw error if doesn't exist + + // should be first + LogEntry entry = new LogEntry(GetNewLsn(), tid, LogType.Prepare); + // TODO: make sure it is correct lsn/prevLsn values + entry.lsn = entry.prevLsn; + entry.pks = new PrimaryKey[0]; + entry.tupleDescs = new TupleDesc[0][]; + entry.vals = new byte[0][]; + // entry.pks = shardToWriteset.SelectMany(x => x.Value.Select(y => y.Item1)).ToArray(); + // entry.tupleDescs = shardToWriteset.SelectMany(x => x.Value.Select(y => y.Item2)).ToArray(); + // entry.vals = shardToWriteset.SelectMany(x => x.Value.Select(y => y.Item3)).ToArray(); + + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + txnTbl[tid] = entry.lsn; + + // add out message to each shard + foreach (var shard in shardToWriteset) { + long darqId = shard.Key; + List<(PrimaryKey, TupleDesc[], byte[])> writeset = shard.Value; + LogEntry outEntry = new LogEntry(0, tid, writeset.Select(x => x.Item1).ToArray(), writeset.Select(x => x.Item2).ToArray(), writeset.Select(x => x.Item3).ToArray()); + // PrintDebug($"sending prepare msg to {darqId} with keys {string.Join(", ", writeset.Select(x => x.Item1))}"); + requestBuilder.AddOutMessage(new DarqId(darqId), outEntry.ToBytes()); + } + + StepAndReturnRequestBuilder(requestBuilder); + return entry.lsn; + } + + /// + /// Commit or abort a transaction and consume OK messages from 2PC + /// + /// + /// + /// + /// lsn 
of finish log + public long Finish2pc(long tid, LogType type, List<(long, long)> darqLsnsToConsume){ + StepRequestBuilder requestBuilder = requestBuilders[tid]; // throw error if doesn't exist + + LogEntry entry = new LogEntry(txnTbl[tid], tid, type); + entry.lsn = GetNewLsn(); + requestBuilder.AddRecoveryMessage(entry.ToBytes()); + foreach (var item in darqLsnsToConsume) { + long darqLsn = item.Item1; + requestBuilder.MarkMessageConsumed(darqLsn); + } + // todo: fix lsn + LogEntry outEntry = new LogEntry(0, entry.tid, LogType.Commit); + foreach (var item in darqLsnsToConsume) { + long shard = item.Item2; + requestBuilder.AddOutMessage(new DarqId(shard), outEntry.ToBytes()); + } + + StepAndReturnRequestBuilder(requestBuilder); + requestBuilders.Remove(entry.tid, out _); + txnTbl.TryRemove(tid, out _); + return entry.lsn; + } + protected long GetNewLsn() { + return Interlocked.Increment(ref currLsn); + } + + protected void StepAndReturnRequestBuilder(StepRequestBuilder requestBuilder){ + StepRequest stepRequest = requestBuilder.FinishStep(); + var v = capabilities.Step(stepRequest); + Debug.Assert(v.GetAwaiter().GetResult() == StepStatus.SUCCESS); + requestPool.Return(stepRequest); + } + + public void SetCapabilities(IDarqProcessorClientCapabilities capabilities) { + this.capabilities = capabilities; + } + public void Terminate(){ + return; + } + + void PrintDebug(string msg, TransactionContext ctx = null){ + if (logger != null) logger.LogInformation($"[WAL {partitionId} TID {(ctx != null ? 
ctx.tid : -1)}]: {msg}"); + } + +} + +} \ No newline at end of file diff --git a/cs/research/darq/DistributedTransactions/distributedTransactions.proto b/cs/research/darq/DistributedTransactions/distributedTransactions.proto new file mode 100644 index 000000000..ba8e86cd5 --- /dev/null +++ b/cs/research/darq/DistributedTransactions/distributedTransactions.proto @@ -0,0 +1,88 @@ +syntax = "proto3"; + +option csharp_namespace = "DB"; + +service TransactionProcessor { + rpc Read (ReadRequest) returns (ReadReply); + rpc EnqueueWorkload (EnqueueWorkloadRequest) returns (EnqueueWorkloadReply); + rpc WriteWalEntry (WalRequest) returns (WalReply); + rpc ReadSecondary (ReadSecondaryRequest) returns (ReadSecondaryReply); + rpc PopulateTables (PopulateTablesRequest) returns (PopulateTablesReply); + // rpc Update (WriteRequest) returns (WriteReply); + // rpc Insert (WriteRequest) returns (WriteReply); + // rpc Begin (BeginRequest) returns (BeginReply); + // rpc Commit (CommitRequest) returns (CommitReply); + // rpc Prepare (PrepareRequest) returns (PrepareReply); +} + +message PbPrimaryKey { + repeated int64 keys = 1; + int32 table = 2; +} + +message ReadRequest { + PbPrimaryKey key = 1; + int64 tid = 3; // tid of transaction context + int64 partitionId = 10; +} + +message ReadReply { + bytes value = 1; +} + +message ReadSecondaryRequest { + bytes key = 1; + int32 table = 2; + int64 tid = 3; // tid of transaction context + int64 partitionId = 10; +} + +message ReadSecondaryReply { + bytes value = 1; + PbPrimaryKey key = 2; +} + +message PopulateTablesRequest { + // benchmarkCfg + int32 seed = 1; + double ratio = 2; + int32 threadCount = 3; + int32 attrCount = 4; + int32 perThreadDataCount = 5; + int32 iterationCount = 6; + int32 perTransactionCount = 8; + int32 nCommitterThreads = 9; + // tpcCfg + int32 numWh = 10; + int32 numDistrict = 11; + int32 numCustomer = 12; + int32 numOrder = 13; + int32 numItem = 14; + int32 numStock = 15; + int32 newOrderCrossPartitionProbability = 
16; + int32 paymentCrossPartitionProbability = 17; + int32 partitionsPerThread = 18; +} + +message PopulateTablesReply { + bool success = 1; +} + +message EnqueueWorkloadRequest { + string workload = 1; +} + +message EnqueueWorkloadReply { + bool success = 1; +} + +message WalRequest { + bytes message = 1; + int64 tid = 2; + int64 lsn = 3; + int64 partitionId = 10; +} + +message WalReply { + bool success = 1; +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/AggregateEventProcessor.cs b/cs/research/darq/EventProcessing/AggregateEventProcessor.cs new file mode 100644 index 000000000..6a8fd78aa --- /dev/null +++ b/cs/research/darq/EventProcessing/AggregateEventProcessor.cs @@ -0,0 +1,92 @@ +using System.Diagnostics; +using dse.services; +using pubsub; +using StepRequest = pubsub.StepRequest; + +namespace EventProcessing; + +public class AggregateEventProcessor : SpPubSubEventHandler +{ + private int outputTopic; + private PubsubCapabilities capabilities; + private long currentBatchStartTime = -1; + private long largestTimestampInBatch; + private Dictionary currentBatch; + private StepRequest currentRequest; + + public AggregateEventProcessor(int outputTopic) + { + this.outputTopic = outputTopic; + currentBatch = new Dictionary(); + currentRequest = new StepRequest + { + TopicId = outputTopic + }; + } + + public async ValueTask HandleAsync(Event ev, CancellationToken token) + { + if (ev.Data.Equals("termination")) + { + currentRequest.ConsumedMessageOffsets.Add(ev.Offset); + currentRequest.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = ev.Data + }); + await capabilities.Step(currentRequest); + return; + } + var split = ev.Data.Split(":"); + Debug.Assert(split.Length == 3); + var term = split[0].Trim(); + Debug.Assert(term.Equals(SearchListStreamUtils.relevantSearchTerm)); + var region = split[1].Trim(); + var timestamp = long.Parse(split[2].Trim()); + if (currentBatchStartTime == -1) + currentBatchStartTime = 
timestamp; + + if (!currentBatch.TryGetValue(region, out var c)) + currentBatch[region] = 1; + else + currentBatch[region] = c + 1; + + if (timestamp > currentBatchStartTime + SearchListStreamUtils.WindowSizeMilli) + { + foreach (var (k, count) in currentBatch) + currentRequest.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = $"{k} : {count} : {largestTimestampInBatch}" + }); + + await capabilities.Step(currentRequest); + currentRequest = new StepRequest + { + TopicId = outputTopic + }; + currentBatch.Clear(); + currentBatchStartTime = timestamp; + } + + largestTimestampInBatch = Math.Max(largestTimestampInBatch, timestamp); + currentRequest.ConsumedMessageOffsets.Add(ev.Offset); + } + + public ValueTask HandleAwait() + { + return ValueTask.CompletedTask; + } + + public void OnRestart(PubsubCapabilities capabilities) + { + this.capabilities = capabilities; + currentRequest = new StepRequest + { + TopicId = outputTopic + }; + currentBatch.Clear(); + currentBatchStartTime = -1; + largestTimestampInBatch = 0; + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/AnomalyDetectionEventProcessor.cs b/cs/research/darq/EventProcessing/AnomalyDetectionEventProcessor.cs new file mode 100644 index 000000000..74dac62cb --- /dev/null +++ b/cs/research/darq/EventProcessing/AnomalyDetectionEventProcessor.cs @@ -0,0 +1,107 @@ +using System.Text; +using dse.services; +using Google.Protobuf; +using pubsub; +using ValueTask = System.Threading.Tasks.ValueTask; + +namespace EventProcessing; + +public class AnomalyDetectionEventProcessor : SpPubSubEventHandler +{ + private int outputTopic; + private PubsubCapabilities capabilities; + private Dictionary state; + private StepRequest currentStep; + private int batchedCount; + private Random random; + private double sampleRate; + + public AnomalyDetectionEventProcessor(int outputTopic, double sampleRate) + { + this.outputTopic = outputTopic; + state = new Dictionary(); + currentStep = new 
StepRequest + { + TopicId = outputTopic + }; + random = new Random(); + this.sampleRate = sampleRate; + } + + public async ValueTask HandleAsync(Event ev, CancellationToken token) + { + switch (ev.Type) + { + case DarqMessageType.In: + { + if (ev.Data.Equals("termination")) + { + currentStep.ConsumedMessageOffsets.Add(ev.Offset); + currentStep.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = ev.Data + }); + await CheckpointCurrentState(); + return; + } + // Console.WriteLine($"Received Message: {message}"); + var split = ev.Data.Split(":"); + var key = split[0]; + var count = long.Parse(split[1]); + var prevHash = state.GetValueOrDefault(key, 0); + state[key] = (31 * count + prevHash).GetHashCode(); + currentStep.ConsumedMessageOffsets.Add(ev.Offset); + + if (random.NextDouble() < sampleRate) + { + currentStep.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = ev.Data + }); + await CheckpointCurrentState(); + } + else if (batchedCount == 100) + { + await CheckpointCurrentState(); + } + return; + } + case DarqMessageType.Recovery: + { + var split = ev.Data.Split(":"); + state[split[0]] = int.Parse(split[1]); + return; + } + default: + throw new NotImplementedException(); + } + } + + public ValueTask HandleAwait() + { + return CheckpointCurrentState(); + } + + private async ValueTask CheckpointCurrentState() + { + if (state.Count == 0 && currentStep.ConsumedMessageOffsets.Count == 0) return; + // TODO(Tianyu): Need some API to easily GC recovery messages + foreach (var entry in state) + currentStep.RecoveryMessages.Add(ByteString.CopyFrom($"{entry.Key}:{entry.Value}", Encoding.UTF8)); + await capabilities.Step(currentStep); + currentStep = new StepRequest(); + batchedCount = 0; + } + + public void OnRestart(PubsubCapabilities capabilities) + { + this.capabilities = capabilities; + state = new Dictionary(); + currentStep = new StepRequest + { + TopicId = outputTopic + }; + } +} \ No newline at end of file diff --git 
a/cs/research/darq/EventProcessing/Environments.cs b/cs/research/darq/EventProcessing/Environments.cs new file mode 100644 index 000000000..17c12018f --- /dev/null +++ b/cs/research/darq/EventProcessing/Environments.cs @@ -0,0 +1,154 @@ +using Azure.Storage.Blobs; +using FASTER.core; +using FASTER.libdpr; + +namespace EventProcessing; + +public interface IEnvironment +{ + Dictionary GetClusterMap(); + + Task PublishResultsAsync(string fileName, MemoryStream bytes); + + int GetPubsubServicePort(int hostId); + + public FileBasedCheckpointManager GetDarqCheckpointManager(int topicId); + + public IDevice GetDarqDevice(int topicId); + + public string GetDprFinderConnString(); + + public int GetDprFinderPort(); + + public PingPongDevice GetDprFinderDevice(); +} + +public class LocalDebugEnvironment : IEnvironment +{ + private readonly Dictionary clusterMap = new() + { + { 0, (0, "http://127.0.0.1:15721") }, + { 1, (0, "http://127.0.0.1:15721") }, + { 2, (0, "http://127.0.0.1:15721") }, + { 3, (0, "http://127.0.0.1:15721") } + }; + + public Dictionary GetClusterMap() + { + return clusterMap; + } + + + public int GetPubsubServicePort(int hostId) + { + return 15721 + hostId; + } + + public FileBasedCheckpointManager GetDarqCheckpointManager(int topicId) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\darq{topicId}"), removeOutdated: false); + result.PurgeAll(); + return result; + } + + public IDevice GetDarqDevice(int topicId) + { + return new ManagedLocalStorageDevice($"D:\\darq{topicId}.log", deleteOnClose: true); + } + + public string GetDprFinderConnString() => "http://127.0.0.1:15720"; + + public int GetDprFinderPort() => 15720; + + public PingPongDevice GetDprFinderDevice() + { + var device1 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + var device2 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + return new PingPongDevice(device1, device2, true); + } + + public Task 
PublishResultsAsync(string fileName, MemoryStream bytes) + { + Console.WriteLine($"Results for {fileName}:"); + var reader = new StreamReader(bytes); + var text = reader.ReadToEnd(); + // Print to console + Console.Write(text); + return Task.CompletedTask; + } +} + +public class KubernetesLocalStorageEnvironment : IEnvironment +{ + private bool cleanStart; + private readonly Dictionary clusterMap = new() + { + { 0, (0, "http://pubsub0.dse.svc.cluster.local:15721") }, + { 1, (1, "http://pubsub1.dse.svc.cluster.local:15721") }, + { 2, (0, "http://pubsub0.dse.svc.cluster.local:15721") }, + { 3, (1, "http://pubsub1.dse.svc.cluster.local:15721") } + }; + + public KubernetesLocalStorageEnvironment(bool cleanStart) + { + this.cleanStart = cleanStart; + } + + public Dictionary GetClusterMap() + { + return clusterMap; + } + + public int GetPubsubServicePort(int hostId) + { + return 15721; + } + + public FileBasedCheckpointManager GetDarqCheckpointManager(int topicId) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"/mnt/plrs/darq{topicId}"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetDarqDevice(int topicId) + { + if (cleanStart) + ManagedLocalStorageDevice.RemoveIfPresent($"/mnt/plrs/darq{topicId}.log"); + return new ManagedLocalStorageDevice($"/mnt/plrs/darq{topicId}.log"); + } + + public string GetDprFinderConnString() => "http://dprfinder.dse.svc.cluster.local:15721"; + + public int GetDprFinderPort() => 15721; + + public PingPongDevice GetDprFinderDevice() + { + if (cleanStart) + { + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder1"); + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder2"); + } + + var device1 = new ManagedLocalStorageDevice("/mnt/plrs/finder1", recoverDevice: true); + var device2 = new ManagedLocalStorageDevice("/mnt/plrs/finder2", recoverDevice: true); + return new PingPongDevice(device1, 
device2, true); + } + + public async Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + var connString = Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + await blobContainerClient.CreateIfNotExistsAsync(); + var blobClient = blobContainerClient.GetBlobClient(fileName); + + await blobClient.UploadAsync(bytes, overwrite: true); + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/EventProcessing.csproj b/cs/research/darq/EventProcessing/EventProcessing.csproj new file mode 100644 index 000000000..8c5ad5311 --- /dev/null +++ b/cs/research/darq/EventProcessing/EventProcessing.csproj @@ -0,0 +1,21 @@ + + + + Exe + net7.0 + enable + true + enable + + + + + + + + + + + + + diff --git a/cs/research/darq/EventProcessing/FilterAndMapProcessor.cs b/cs/research/darq/EventProcessing/FilterAndMapProcessor.cs new file mode 100644 index 000000000..b7682d4d7 --- /dev/null +++ b/cs/research/darq/EventProcessing/FilterAndMapProcessor.cs @@ -0,0 +1,75 @@ +using System.Diagnostics; +using dse.services; +using Newtonsoft.Json; +using pubsub; + +namespace EventProcessing; + +public class FilterAndMapEventProcessor : SpPubSubEventHandler +{ + private int outputTopic; + private PubsubCapabilities capabilities; + private pubsub.StepRequest currentBatch; + private int batchSize, numBatchedSteps = 0; + + public FilterAndMapEventProcessor(int outputTopic, int batchSize = 32) + { + this.outputTopic = outputTopic; + this.batchSize = batchSize; + currentBatch = new pubsub.StepRequest(); + } + + public async ValueTask HandleAsync(Event ev, CancellationToken token) + { + Debug.Assert(ev.Type == pubsub.DarqMessageType.In); + if (ev.Data.Equals("termination")) + { + // Forward termination signal + numBatchedSteps++; + currentBatch.ConsumedMessageOffsets.Add(ev.Offset); + 
currentBatch.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = ev.Data + }); + await Flush(); + return; + } + + var searchListItem = + JsonConvert.DeserializeObject(ev.Data); + Debug.Assert(searchListItem != null); + currentBatch.ConsumedMessageOffsets.Add(ev.Offset); + if (searchListItem.SearchTerm.Contains(SearchListStreamUtils.relevantSearchTerm)) + { + currentBatch.OutMessages.Add(new OutMessage + { + TopicId = outputTopic, + Event = $"{SearchListStreamUtils.relevantSearchTerm} : {SearchListStreamUtils.GetRegionCode(searchListItem.IP)} : {searchListItem.Timestamp}" + }); + } + + if (++numBatchedSteps == batchSize) + await Flush(); + } + + public ValueTask HandleAwait() + { + return Flush(); + } + + private async ValueTask Flush() + { + if (numBatchedSteps == 0) return; + await capabilities.Step(currentBatch); + currentBatch = new pubsub.StepRequest(); + numBatchedSteps = 0; + } + + public void OnRestart(PubsubCapabilities capabilities) + { + this.capabilities = capabilities; + currentBatch = new pubsub.StepRequest(); + numBatchedSteps = 0; + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/Program.cs b/cs/research/darq/EventProcessing/Program.cs new file mode 100644 index 000000000..628706308 --- /dev/null +++ b/cs/research/darq/EventProcessing/Program.cs @@ -0,0 +1,233 @@ +// See https://aka.ms/new-console-template for more information + +using System.Diagnostics; +using System.Net; +using CommandLine; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; +using Grpc.Net.Client; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using dse.services; +using FASTER.client; +using Microsoft.Extensions.Logging; + +namespace EventProcessing; + +public class Options +{ + [Option('t', "type", Required = true, + HelpText = "type of worker to launch")] + public string Type { get; set; } + + 
[Option('w', "workload-trace", Required = false, + HelpText = "Workload trace file to use")] + public string WorkloadTrace { get; set; } + + [Option('o', "output-name", Required = false, + HelpText = "Name of output file")] + public string OutputName { get; set; } + + [Option('h', "host-id", Required = false, + HelpText = "identifier of the service to launch")] + public int HostId { get; set; } + + [Option('s', "speculative", Required = false, Default = false, + HelpText = "whether services proceed speculatively")] + public bool Speculative { get; set; } + + + [Option('i', "checkpoint-interval", Required = false, Default = 10, + HelpText = "checkpoint interval")] + public int CheckpointInterval { get; set; } +} + +public class Program +{ + public static async Task Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + // IEnvironment environment = new LocalDebugEnvironment(); + var environment = new KubernetesLocalStorageEnvironment(true); + switch (options.Type.Trim()) + { + case "client": + await LaunchBenchmarkClient(options, environment); + break; + case "filter": + case "aggregate": + case "detection": + await LaunchProcessor(options, environment); + break; + case "worker": + await LaunchPubsubService(options, environment); + break; + case "dprfinder": + await LaunchDprFinder(options, environment); + break; + case "generate": + new SearchListDataGenerator().SetOutputFile("C:\\Users\\tianyu\\Desktop\\workloads\\EventProcessing\\workloads\\events-50k.txt") + .SetSearchTermRelevantProb(0.2) + .SetTrendParameters(0.1, 50000, 25000) + .SetSearchTermLength(80) + .SetThroughput(50000) + .SetNumSearchTerms(50000 * 30) + .Generate(); + break; + default: + throw new NotImplementedException(); + } + } + + private static async Task LaunchBenchmarkClient(Options options, IEnvironment environment) + { + var client = new 
SpPubSubServiceClient(environment.GetClusterMap()); + var stopwatch = new Stopwatch(); + var loader = new SearchListDataLoader(options.WorkloadTrace, client, 0, stopwatch); + var numRecords = loader.LoadData(); + _ = Task.Run(loader.Run); + var processingClient = new SpPubSubProcessorClient(3, client); + var measurementProcessor = new SearchListLatencyMeasurementProcessor(stopwatch, client); + _ = Task.Run(async () => await processingClient.StartProcessingAsync(measurementProcessor, false)); + await measurementProcessor.workloadTerminationed.Task; + var throughput = numRecords * 1000.0 / stopwatch.ElapsedMilliseconds; + await WriteLatencyResults(options, environment, measurementProcessor); + await WriteOtherResults(options, environment, throughput, measurementProcessor.totalBytesWritten); + } + + private static async Task WriteLatencyResults(Options options, IEnvironment environment, SearchListLatencyMeasurementProcessor processor) + { + using var memoryStream = new MemoryStream(); + await using var streamWriter = new StreamWriter(memoryStream); + foreach (var line in processor.results) + streamWriter.WriteLine(line.Value.Item2 - line.Value.Item1); + await streamWriter.FlushAsync(); + memoryStream.Position = 0; + + await environment.PublishResultsAsync($"{options.OutputName}-lat.csv", memoryStream); + } + + private static async Task WriteOtherResults(Options options, IEnvironment environment, double throughput, long bytesWritten) + { + using var memoryStream = new MemoryStream(); + await using var streamWriter = new StreamWriter(memoryStream); + streamWriter.WriteLine($"Throughput: {throughput}"); + streamWriter.WriteLine($"BytesWritten: {bytesWritten}"); + await streamWriter.FlushAsync(); + memoryStream.Position = 0; + + await environment.PublishResultsAsync($"{options.OutputName}-stats.csv", memoryStream); + } + + public static Task LaunchPubsubService(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + + 
builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetPubsubServicePort(options.HostId), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + + builder.Services.AddSingleton(new SpPubSubServiceSettings + { + clusterMap = environment.GetClusterMap(), + factory = (id, dprId) => new Darq(new DarqSettings + { + Me = new DarqId(id), + MyDpr = dprId, + DprFinder = new GrpcDprFinder(GrpcChannel.ForAddress(environment.GetDprFinderConnString())), + LogDevice = environment.GetDarqDevice(id), + LogCommitManager = environment.GetDarqCheckpointManager(id), + PageSize = 1L << 22, + MemorySize = 1L << 30, + SegmentSize = 1L << 30, + CheckpointPeriodMilli = options.CheckpointInterval, + RefreshPeriodMilli = 5, + FastCommitMode = true + }, new RwLatchVersionScheme()), + hostId = options.HostId, + }); + builder.Services.AddSingleton(); + + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + + builder.Services.AddSingleton(); + builder.Services.AddGrpc(); + var app = builder.Build(); + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + return app.RunAsync(); + } + + public static async Task LaunchProcessor(Options options, IEnvironment environment) + { + var client = new SpPubSubServiceClient(environment.GetClusterMap()); + var outputTopic = options.Type switch + { + "filter" => 1, + "aggregate" => 2, + "detection" => 3, + _ => throw new ArgumentOutOfRangeException() + }; + + var processingClient = new SpPubSubProcessorClient(outputTopic - 1, client); + SpPubSubEventHandler handler = options.Type switch + { + "filter" => new FilterAndMapEventProcessor(outputTopic), + "aggregate" => new AggregateEventProcessor(outputTopic), + "detection" => new AnomalyDetectionEventProcessor(outputTopic, 1.0), + _ => throw new ArgumentOutOfRangeException() + }; + await processingClient.StartProcessingAsync(handler, options.Speculative); + } + + public static async Task LaunchDprFinder(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetDprFinderPort(), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + using var dprFinderServiceDevice = environment.GetDprFinderDevice(); + builder.Services.AddSingleton(dprFinderServiceDevice); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + + builder.Services.AddGrpc(); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/SearchListJson.cs b/cs/research/darq/EventProcessing/SearchListJson.cs new file mode 100644 index 000000000..945f65b96 --- /dev/null +++ b/cs/research/darq/EventProcessing/SearchListJson.cs @@ -0,0 +1,9 @@ +namespace EventProcessing; + +public class SearchListJson +{ + public long UserId { get; set; } + public string SearchTerm { get; set; } + public string IP { get; set; } + public long Timestamp { get; set; } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/SearchListLatencyMeasurementProcessor.cs b/cs/research/darq/EventProcessing/SearchListLatencyMeasurementProcessor.cs new file mode 100644 index 000000000..d0d951173 --- /dev/null +++ b/cs/research/darq/EventProcessing/SearchListLatencyMeasurementProcessor.cs @@ -0,0 +1,47 @@ +using System.Diagnostics; +using dse.services; +using pubsub; + +namespace EventProcessing; + +public class SearchListLatencyMeasurementProcessor : SpPubSubEventHandler +{ + public Dictionary results = new(); + public TaskCompletionSource workloadTerminationed = new(); + public long totalBytesWritten = 0; + private Stopwatch stopwatch; + private SpPubSubServiceClient client; + + public SearchListLatencyMeasurementProcessor(Stopwatch stopwatch, SpPubSubServiceClient client) + { + this.stopwatch = stopwatch; + this.client = client; + } + + public async ValueTask HandleAsync(Event ev, CancellationToken token) + { + if (ev.Data.Equals("termination")) + { + stopwatch.Stop(); + for (var i = 0; i < 4; i++) + totalBytesWritten += (await client.GetNumBytesWritten(i)).NumBytes; + workloadTerminationed.SetResult(); + return; + } + var split = ev.Data.Split(":"); + var timestamp = long.Parse(split[2]); + var endTime = stopwatch.ElapsedMilliseconds; + results[ev.Data] = (timestamp, endTime); + // Console.WriteLine($"Received 
{ev.Data}, {timestamp}, {endTime}"); + } + + public ValueTask HandleAwait() + { + return ValueTask.CompletedTask; + } + + + public void OnRestart(PubsubCapabilities capabilities) + { + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/SearchListStreamUtils.cs b/cs/research/darq/EventProcessing/SearchListStreamUtils.cs new file mode 100644 index 000000000..5397e59b4 --- /dev/null +++ b/cs/research/darq/EventProcessing/SearchListStreamUtils.cs @@ -0,0 +1,261 @@ +using System.Diagnostics; +using System.Text; +using dse.services; +using MathNet.Numerics.Distributions; +using Newtonsoft.Json; +using pubsub; + +namespace EventProcessing; + +public class SearchListStreamUtils +{ + public const string relevantSearchTerm = "fever"; + + public const int WindowSizeMilli = 500; + + public const int numRegions = 8; + + public static int GetRegionCode(string ip) + { + return ip.Split(".").Select(int.Parse).Sum() % numRegions; + } +} + +public class SearchListDataGenerator +{ + private string outputFile; + + private double trendProb, relevantProb; + + // In number of searches + private int avgTrendLength, stdTrendLength; + private int searchTermLength; + private int termsPerSecond; + private int numSearchTerms; + + public SearchListDataGenerator SetOutputFile(string outputFile) + { + this.outputFile = outputFile; + return this; + } + + public SearchListDataGenerator SetSearchTermRelevantProb(double relevantProb) + { + this.relevantProb = relevantProb; + return this; + } + + public SearchListDataGenerator SetTrendParameters(double trendProb, int avgTrendLength, int stdTrendLength) + { + this.trendProb = trendProb; + this.avgTrendLength = avgTrendLength; + this.stdTrendLength = stdTrendLength; + return this; + } + + public SearchListDataGenerator SetThroughput(int termsPerSecond) + { + this.termsPerSecond = termsPerSecond; + return this; + } + + public SearchListDataGenerator SetSearchTermLength(int searchTermLength) + { + this.searchTermLength = 
searchTermLength; + return this; + } + + public SearchListDataGenerator SetNumSearchTerms(int numSearchTerms) + { + this.numSearchTerms = numSearchTerms; + return this; + } + + private static string GenerateRandomIp(Random random) + { + var component1 = random.Next(256); + var component2 = random.Next(256); + var component3 = random.Next(256); + var component4 = random.Next(256); + return $"{component1}.{component2}.{component3}.{component4}"; + } + + private string PopulateSearchTerm(Random random, StringBuilder builder, bool relevant) + { + builder.Clear(); + var length = searchTermLength; + if (relevant) + { + builder.Append(SearchListStreamUtils.relevantSearchTerm); + length -= SearchListStreamUtils.relevantSearchTerm.Length; + } + + for (var i = 0; i < length; i++) + // Generate ascii alphabets + builder.Append((char)random.Next(97, 123)); + return builder.ToString(); + } + + private string[] BuildRegionReverseLookUpTable(Random random) + { + var result = new string[SearchListStreamUtils.numRegions]; + for (var i = 0; i < SearchListStreamUtils.numRegions; i++) + { + string ip; + do + { + ip = GenerateRandomIp(random); + } while (SearchListStreamUtils.GetRegionCode(ip) != i); + + result[i] = ip; + } + + return result; + } + + private void ComputeTrend(int numTermsGenerated, Random random, int[] trendTable) + { + for (var i = 0; i < SearchListStreamUtils.numRegions; i++) + { + if (trendTable[i] < numTermsGenerated) trendTable[i] = -1; + if (trendTable[i] == -1 && random.NextDouble() < trendProb) + trendTable[i] = i + (int)Math.Max(0, Normal.Sample(random, avgTrendLength, stdTrendLength)); + } + } + + public void Generate() + { + var random = new Random(); + // Pre-populate some reverse-lookup table for regions; + var regionTable = BuildRegionReverseLookUpTable(random); + var trendTable = new int[SearchListStreamUtils.numRegions]; + + try + { + File.Delete(outputFile); + } + catch (Exception) + { + } + + using var writer = new StreamWriter(new 
FileStream(outputFile, FileMode.Create)); + var builder = new StringBuilder(); + var jsonObject = new SearchListJson(); + var numSeconds = numSearchTerms / termsPerSecond; + var searchesPerSecond = new int[numSeconds]; + Poisson.Samples(random, searchesPerSecond, termsPerSecond); + for (var time = 0; time < numSeconds; time++) + { + var milliStep = 1000.0 / searchesPerSecond[time]; + for (var i = 0; i < searchesPerSecond[time]; i++) + { + ComputeTrend(i, random, trendTable); + var region = random.Next(SearchListStreamUtils.numRegions); + // triple the probability that we generate a relevant search term by 3 times if trending + var prob = trendTable[region] == -1 ? relevantProb : relevantProb * 3; + var relevant = random.NextDouble() < prob; + jsonObject.SearchTerm = PopulateSearchTerm(random, builder, relevant); + jsonObject.UserId = random.NextInt64(); + jsonObject.IP = regionTable[region]; + jsonObject.Timestamp = time * 1000 + (int)Math.Floor(milliStep * i); + writer.WriteLine(JsonConvert.SerializeObject(jsonObject)); + } + } + } +} + +public class SearchListDataLoader +{ + private string filename; + private List rawJsons = new(); + private List parsedJsons = new(); + private SpPubSubServiceClient client; + private int topicName; + private Stopwatch stopwatch; + + public SearchListDataLoader(string filename, SpPubSubServiceClient client, int topicName, Stopwatch stopwatch) + { + this.filename = filename; + this.client = client; + this.topicName = topicName; + this.stopwatch = stopwatch; + } + + public int LoadData() + { + Console.WriteLine("Started loading json messages from file"); + rawJsons = File.ReadLines(filename).ToList(); + parsedJsons = rawJsons.Select(JsonConvert.DeserializeObject).ToList()!; + Console.WriteLine($"Loading of {parsedJsons.Count} messages complete"); + return parsedJsons.Count; + } + + public async Task Run() + { + var semaphore = new SemaphoreSlim(128, 128); + stopwatch.Start(); + var batched = new EnqueueRequest + { + ProducerId = 0, + 
TopicId = topicName, + FireAndForget = true + }; + for (var i = 0; i < parsedJsons.Count; i++) + { + var json = parsedJsons[i]; + var currentTime = stopwatch.ElapsedMilliseconds; + while (currentTime < json.Timestamp) + { + if (batched.Events.Count != 0) + { + var batched1 = batched; + var now = stopwatch.ElapsedMilliseconds; + await semaphore.WaitAsync(); + _ = Task.Run(async () => + { + await client.EnqueueEventsAsync(batched1); + semaphore.Release(); + // Console.WriteLine($"Batched {batched1.Events.Count} requests, and request returned in {stopwatch.ElapsedMilliseconds - now} ms"); + }); + batched = new EnqueueRequest + { + ProducerId = 0, + TopicId = topicName, + FireAndForget = true + }; + } + Thread.Yield(); + currentTime = stopwatch.ElapsedMilliseconds; + } + batched.SequenceNum = i; + batched.Events.Add(rawJsons[i]); + if (batched.Events.Count >= 64) + { + await semaphore.WaitAsync(); + var now = stopwatch.ElapsedMilliseconds; + var batched1 = batched; + _ = Task.Run(async () => + { + await client.EnqueueEventsAsync(batched1); + semaphore.Release(); + // Console.WriteLine($"Batched {batched1.Events.Count} requests, and request returned in {stopwatch.ElapsedMilliseconds - now} ms"); + }); + batched = new EnqueueRequest + { + ProducerId = 0, + TopicId = topicName, + FireAndForget = true + }; + } + } + Console.WriteLine("########## Finished publishing messages"); + var termination = new EnqueueRequest + { + ProducerId = 0, + SequenceNum = parsedJsons.Count, + TopicId = topicName + }; + termination.Events.Add($"termination"); + await client.EnqueueEventsAsync(termination); + } +} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-storage/Chart.yaml b/cs/research/darq/EventProcessing/helm-storage/Chart.yaml new file mode 100644 index 000000000..50a0ff477 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-storage/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: events-storage +description: Helm chart for EventProcessing 
workload (storage resources) +type: application +version: 0.1.0 +appVersion: "1.16.0" \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-storage/templates/dprfinder-pvc.yaml b/cs/research/darq/EventProcessing/helm-storage/templates/dprfinder-pvc.yaml new file mode 100644 index 000000000..d91dc5cd9 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-storage/templates/dprfinder-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: dprfinder-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: premium-lrs + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-storage/templates/pubsub-pvc.yaml b/cs/research/darq/EventProcessing/helm-storage/templates/pubsub-pvc.yaml new file mode 100644 index 000000000..f87d585a0 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-storage/templates/pubsub-pvc.yaml @@ -0,0 +1,14 @@ +{{- range .Values.workers }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pubsub{{ .num }}-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: premium-lrs + resources: + requests: + storage: 10Gi +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-storage/templates/storageclass.yaml b/cs/research/darq/EventProcessing/helm-storage/templates/storageclass.yaml new file mode 100644 index 000000000..9e19c8900 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-storage/templates/storageclass.yaml @@ -0,0 +1,9 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: premium-lrs +provisioner: kubernetes.io/azure-disk +parameters: + storageaccounttype: Premium_LRS +reclaimPolicy: Delete +volumeBindingMode: Immediate \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-storage/values.yaml b/cs/research/darq/EventProcessing/helm-storage/values.yaml new file mode 100644 index 000000000..bb0d1a8df --- 
/dev/null +++ b/cs/research/darq/EventProcessing/helm-storage/values.yaml @@ -0,0 +1,12 @@ +conn_string: foo +results_conn_string: foo +workload: workload-small + +workers: + - num: 0 + - num: 1 + +processors: + - name: aggregate + - name: detection + - name: filter \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/Chart.yaml b/cs/research/darq/EventProcessing/helm-workload/Chart.yaml new file mode 100644 index 000000000..011526979 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: events-workload +description: Helm chart for EventProcessing workload (compute resources) +type: application +version: 0.1.0 +appVersion: "1.16.0" \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/client-job.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/client-job.yaml new file mode 100644 index 000000000..a35612181 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/client-job.yaml @@ -0,0 +1,44 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: client +spec: + template: + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-all + image: bitnami/kubectl + command: [ "sh", "-c" ] + args: + - > + while true; do + # Check if all deployments have at least one ready replica + notReadyCount=$(kubectl get pods -l type=processor -n dse | grep -c "0/1") + if [ "$notReadyCount" -eq 0 ]; then + echo "All deployments are ready."; + break; + else + echo "Waiting for all deployments to be ready. 
$notReadyCount deployments are not ready."; + sleep 1; + fi; + done; + containers: + - command: + - "EventProcessing/EventProcessing" + args: + - "-t" + - "client" + - "-w" + - "{{.Values.experiment}}/workloads/{{.Values.workload}}.txt" + - "-o" + - "{{.Values.experiment}}-{{.Values.workload}}-results-c{{.Values.checkpoint_interval}}{{- if $.Values.speculative }}speculative{{- end }}" + image: tianyuli96/faster:latest + name: client + ports: + - containerPort: 15721 + envFrom: + - configMapRef: + name: env-config + restartPolicy: Never diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-deployment.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-deployment.yaml new file mode 100644 index 000000000..ffad1d5fd --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + service: dprfinder + name: dprfinder +spec: + replicas: 1 + selector: + matchLabels: + service: dprfinder + strategy: {} + template: + metadata: + labels: + service: dprfinder + spec: + priorityClassName: high-priority + containers: + - command: + - "EventProcessing/EventProcessing" + args: + - "-t" + - "dprfinder" + image: tianyuli96/faster:latest + name: dprfinder + ports: + - containerPort: 15721 + volumeMounts: + - name: dprfinderstorage + mountPath: "/mnt/plrs" + envFrom: + - configMapRef: + name: env-config + volumes: + - name: dprfinderstorage + persistentVolumeClaim: + claimName: dprfinder-pvc + nodeSelector: + nodepool: dsebench + restartPolicy: Always \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-service.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-service.yaml new file mode 100644 index 000000000..5268bde33 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/dprfinder-service.yaml @@ -0,0 +1,13 @@ 
+apiVersion: v1 +kind: Service +metadata: + labels: + service: dprfinder + name: dprfinder +spec: + ports: + - name: "15721" + port: 15721 + targetPort: 15721 + selector: + service: dprfinder diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/env-config.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/env-config.yaml new file mode 100644 index 000000000..a876ec967 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/env-config.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: env-config + namespace: dse +data: + AZURE_CONN_STRING: "{{ .Values.conn_string }}" + AZURE_RESULTS_CONN_STRING: "{{ .Values.results_conn_string }}" \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/processor-jobs.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/processor-jobs.yaml new file mode 100644 index 000000000..8b2fca71e --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/processor-jobs.yaml @@ -0,0 +1,51 @@ +{{- range .Values.processors }} +apiVersion: batch/v1 +kind: Job +metadata: + labels: + type: processor + name: {{ .name }} +spec: + template: + metadata: + labels: + type: processor + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-pubsub + image: bitnami/kubectl + command: [ "sh", "-c" ] + args: + - > + while true; do + # Check if all pubsub deployments have at least one ready replica + notReadyCount=$(kubectl get deployment -l type=pubsub -n dse | grep -c "0/1") + if [ "$notReadyCount" -eq 0 ]; then + echo "All deployments are ready."; + break; + else + echo "Waiting for all deployments to be ready. 
$notReadyCount deployments are not ready."; + sleep 1; + fi; + done; + containers: + - command: + - "EventProcessing/EventProcessing" + args: + - "-t" + - "{{ .name }}" + {{- if $.Values.speculative }} + - "-s" + {{- end }} + image: tianyuli96/faster:latest + name: {{ .name }} + ports: + - containerPort: 15721 + envFrom: + - configMapRef: + name: env-config + restartPolicy: Never +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-deployment.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-deployment.yaml new file mode 100644 index 000000000..f28d3b984 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-deployment.yaml @@ -0,0 +1,55 @@ +{{- range .Values.workers }} +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + type: pubsub + service: pubsub{{ .num }} + name: pubsub{{ .num }} +spec: + replicas: 1 + selector: + matchLabels: + service: pubsub{{ .num }} + strategy: {} + template: + metadata: + labels: + type: pubsub + service: pubsub{{ .num }} + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-for-dprfinder{{ .num }} + image: bitnami/kubectl + command: [ 'sh', '-c', 'until kubectl get deployment -l service=dprfinder -n dse | grep -q "1/1"; do echo waiting for deployment-b; sleep 1; done;' ] + containers: + - command: + - "EventProcessing/EventProcessing" + args: + - "-t" + - "worker" + - "-h" + - "{{ .num }}" + - "-i" + - "{{ $.Values.checkpoint_interval }}" + image: tianyuli96/faster:latest + name: pubsub{{ .num }} + ports: + - containerPort: 15721 + volumeMounts: + - name: pubsub{{ .num }}storage + mountPath: "/mnt/plrs" + envFrom: + - configMapRef: + name: env-config + volumes: + - name: pubsub{{ .num }}storage + persistentVolumeClaim: + claimName: pubsub{{ .num }}-pvc + nodeSelector: + nodepool: dsebench + restartPolicy: Always +--- +{{- end }} \ No newline at 
end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-service.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-service.yaml new file mode 100644 index 000000000..fda0c7950 --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/pubsub-service.yaml @@ -0,0 +1,16 @@ +{{- range .Values.workers }} +apiVersion: v1 +kind: Service +metadata: + labels: + service: pubsub{{ .num }} + name: pubsub{{ .num }} +spec: + ports: + - name: "15721" + port: 15721 + targetPort: 15721 + selector: + service: pubsub{{ .num }} +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/templates/service-role-auth.yaml b/cs/research/darq/EventProcessing/helm-workload/templates/service-role-auth.yaml new file mode 100644 index 000000000..f4291a8df --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/templates/service-role-auth.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: deployment-watcher + namespace: dse +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: deployment-reader + namespace: dse +rules: + - apiGroups: [ "apps", "" ] + resources: [ "deployments", "pods" ] + verbs: [ "get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: read-deployments + namespace: dse +subjects: + - kind: ServiceAccount + name: deployment-watcher + namespace: dse +roleRef: + kind: Role + name: deployment-reader + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/cs/research/darq/EventProcessing/helm-workload/values.yaml b/cs/research/darq/EventProcessing/helm-workload/values.yaml new file mode 100644 index 000000000..9ba39b56e --- /dev/null +++ b/cs/research/darq/EventProcessing/helm-workload/values.yaml @@ -0,0 +1,16 @@ +conn_string: foo +results_conn_string: foo +experiment: EventProcessing-latency +workload: placeholder 
+checkpoint_interval: 10 + +workers: + - num: 0 + - num: 1 + +processors: + - name: aggregate + - name: detection + - name: filter + +speculative: false \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/ExampleServices.csproj b/cs/research/darq/ExampleServices/ExampleServices.csproj new file mode 100644 index 000000000..44a972f18 --- /dev/null +++ b/cs/research/darq/ExampleServices/ExampleServices.csproj @@ -0,0 +1,65 @@ + + + + net7.0 + enable + true + + + + + Server, Client + Public + True + True + obj\Debug\net7.0\ + MSBuild:Compile + + + Server, Client + Public + True + True + obj\Debug\net7.0\ + MSBuild:Compile + + + Server, Client + Public + True + True + obj\Debug\net7.0\ + MSBuild:Compile + + + Server,Client + Public + True + True + obj\Debug\net7.0\ + MSBuild:Compile + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + diff --git a/cs/research/darq/ExampleServices/spfaster/FasterKvReservationService.cs b/cs/research/darq/ExampleServices/spfaster/FasterKvReservationService.cs new file mode 100644 index 000000000..468665ed9 --- /dev/null +++ b/cs/research/darq/ExampleServices/spfaster/FasterKvReservationService.cs @@ -0,0 +1,423 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using FASTER.common; +using FASTER.core; +using FASTER.libdpr; +using Grpc.Core; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using protobuf; + +namespace dse.services; + +public enum TableId : byte +{ + OFFERINGS, + RESERVATIONS +} + +public struct Key : IFasterEqualityComparer +{ + internal ulong key; + + public Key(TableId table, long id) + { + // We reserve the first byte for table id + Debug.Assert(((ulong)id & 0xFF00000000000000L) == 0L); + key = ((ulong)(byte)table << 56) | (ulong)id; + } + + internal Key(ulong key) + { + this.key = key; + } + + public TableId GetTable() => (TableId)((key & 0xFF00000000000000L) >> 56); + + public long 
GetId() => (long)(key & 0xFFFFFFFFFFFFFFL); + + public long GetHashCode64(ref Key k) + { + return Utility.GetHashCode((long)k.key); + } + + public bool Equals(ref Key k1, ref Key k2) + { + return k1.key == k2.key; + } +} + +public struct Value +{ + private Key key; + private long field1, field2, field3; + + private Value(Key key, long field1, long field2, long field3) + { + this.key = key; + this.field1 = field1; + this.field2 = field2; + this.field3 = field3; + } + + public static Value CreateOffering(long offeringId, long entityId, int price, int remainingCount) + { + return new Value(new Key(TableId.OFFERINGS, offeringId), entityId, price, remainingCount); + } + + public static Value CreateReservation(long reservationId, long offeringId, long customerId, int count) + { + return new Value(new Key(TableId.RESERVATIONS, reservationId), offeringId, customerId, count); + } + + public bool TryDecrementCount(int by) + { + Debug.Assert(key.GetTable() == TableId.OFFERINGS); + long currentCount = 0, decremented = 0; + do + { + currentCount = field3; + if (currentCount < by) return false; + decremented = currentCount - by; + } while (Interlocked.CompareExchange(ref field3, decremented, currentCount) != currentCount); + + return true; + } +} + +public class FasterKvReservationStartFile +{ + public string file; +} + +public class FasterKvReservationStateObject : StateObject +{ + public FasterKV kv; + private Guid indexCheckpointToken = default; + private ConcurrentDictionary tokenMappings = new(); + + public FasterKvReservationStateObject(FasterKV kv, + IVersionScheme versionScheme, DprWorkerOptions options) : base(versionScheme, options) + { + this.kv = kv; + } + + public override void Dispose() + { + } + + public override void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + // TODO(Tianyu): Do something about index checkpoints + Console.WriteLine($"Performing checkpoint for version {version}"); + Guid token; + // If return is false, this 
means the previous checkpoint is still running and we should not advance more + var success = kv.TryTakeDprStyleCheckpoint(version, metadata, onPersist, out token); + Debug.Assert(success); + tokenMappings[version] = token; + Task.Run(() => kv.CompleteCheckpointAsync()); + } + + public override void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + // TODO(Tianyu): Figure out how to do advanced non-blocking rollback + tokenMappings.Clear(); + foreach (var guid in kv.CheckpointManager.GetLogCheckpointTokens()) + { + using StreamReader s = new(new MemoryStream(kv.CheckpointManager.GetLogCheckpointMetadata(guid, null))); + s.ReadLine(); // version + s.ReadLine(); // checksum + s.ReadLine(); // guid + s.ReadLine(); // useSnapshotFile + var checkpointVersion = long.Parse(s.ReadLine()); + tokenMappings[checkpointVersion] = guid; + } + + kv.Recover(default, tokenMappings[version]); + metadata = kv.CommitCookie; + } + + public override void PruneVersion(long version) + { + if (!tokenMappings.TryRemove(version, out var guid)) return; + kv.CheckpointManager.Purge(guid); + } + + public override IEnumerable> GetUnprunedVersions() + { + return kv.CheckpointManager.GetLogCheckpointTokens().Select(guid => + { + using StreamReader s = new(new MemoryStream(kv.CheckpointManager.GetLogCheckpointMetadata(guid, null))); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + s.ReadLine(); + var numSessions = int.Parse(s.ReadLine()); + // We don't use recoverable sessions in DPR version of Faster + Debug.Assert(numSessions == 0); + // Read object log segment offsets + var numSegments = int.Parse(s.ReadLine()); + // We don't use object log in DPR version of Faster + Debug.Assert(numSegments == 0); + var cookie = s.ReadToEnd(); + var metadata = cookie.Length == 0 ? 
null : Convert.FromBase64String(cookie); + return new Memory(metadata); + }); + } +} + +public class ReserveFunctions : FunctionsBase +{ + public override bool ConcurrentReader(ref Key key, ref int input, ref Value value, ref bool dst, + ref ReadInfo readInfo) + { + // Should never be called + throw new NotImplementedException(); + } + + public override bool SingleReader(ref Key key, ref int input, ref Value value, ref bool dst, ref ReadInfo readInfo) + { + // Should never be called + throw new NotImplementedException(); + } + + public override bool ConcurrentWriter(ref Key key, ref int input, ref Value src, ref Value dst, ref bool output, + ref UpsertInfo upsertInfo) + { + // Not allowed + upsertInfo.Action = UpsertAction.CancelOperation; + return false; + } + + public override bool SingleWriter(ref Key key, ref int input, ref Value src, ref Value dst, ref bool output, + ref UpsertInfo upsertInfo, + WriteReason reason) + { + dst = src; + return true; + } + + public override bool InitialUpdater(ref Key key, ref int input, ref Value value, ref bool output, + ref RMWInfo rmwInfo) + { + // Should not be called in workload + throw new NotImplementedException(); + } + + public override bool CopyUpdater(ref Key key, ref int input, ref Value oldValue, ref Value newValue, + ref bool output, + ref RMWInfo rmwInfo) + { + newValue = oldValue; + output = newValue.TryDecrementCount(input); + return true; + } + + public override bool InPlaceUpdater(ref Key key, ref int input, ref Value value, ref bool output, + ref RMWInfo rmwInfo) + { + output = value.TryDecrementCount(input); + return true; + } +} + +public class FasterKvReservationBackgroundService : BackgroundService +{ + private FasterKvReservationStateObject backend; + + private ThreadLocalObjectPool>> + sessions; + + private ILogger logger; + private FasterKvReservationStartFile file; + public FasterKvReservationBackgroundService(FasterKvReservationStateObject backend, + FasterKvReservationStartFile file, ILogger logger) 
+ { + this.backend = backend; + this.file = file; + this.logger = logger; + sessions = + new ThreadLocalObjectPool< + ClientSession>>(() => + this.backend.kv.NewSession(new ReserveFunctions())); + } + + private void LoadFromFile(string filename) + { + using var reader = new StreamReader(filename); + var s = sessions.Checkout(); + for (var line = reader.ReadLine(); line != null; line = reader.ReadLine()) + { + var parts = line.Split(','); + var offeringId = long.Parse(parts[0]); + var entityId = long.Parse(parts[1]); + var price = int.Parse(parts[2]); + var count = int.Parse(parts[3]); + + var key = new Key(TableId.OFFERINGS, offeringId); + var val = Value.CreateOffering(offeringId, entityId, price, count); + var status = s.Upsert(ref key, ref val); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + } + + sessions.Return(s); + backend.ForceCheckpoint(); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + logger.LogInformation("Faster service is starting..."); + backend.ConnectToCluster(out var restored); + if (!restored && !file.file.Equals("")) + LoadFromFile(file.file); + + await Task.Delay(Timeout.InfiniteTimeSpan, stoppingToken); + logger.LogInformation("Faster service is stopping..."); + } + + public Task MakeReservation(ReservationRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingId); + var reservationCount = request.Count; + var success = false; + var status = s.RMW(ref offeringKey, ref reservationCount, ref success); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + if (!success) + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + + var reservationsKey = new Key(TableId.RESERVATIONS, request.ReservationId); + var 
reservationsEntry = Value.CreateReservation(request.ReservationId, request.OfferingId, + request.CustomerId, request.Count); + status = s.Upsert(ref reservationsKey, ref reservationsEntry); + if (status.IsCanceled) + { + // this reservation is a duplicate, roll back earlier update + reservationCount = -reservationCount; + status = s.RMW(ref offeringKey, ref reservationCount, ref success); + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + } + else if (status.IsCompletedSuccessfully) + { + return Task.FromResult(new ReservationResponse + { + Ok = true + }); + } + else + throw new NotImplementedException(); + } + finally + { + sessions.Return(s); + } + } + + public Task CancelReservation(ReservationRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingId); + var reservationCount = request.Count; + var reservationsKey = new Key(TableId.RESERVATIONS, request.ReservationId); + + var status = s.Delete(ref reservationsKey); + if (status.NotFound) + { + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + } + + // Add updates back to count + reservationCount = -reservationCount; + var success = false; + status = s.RMW(ref offeringKey, ref reservationCount, ref success); + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new ReservationResponse + { + Ok = true + }); + } + finally + { + sessions.Return(s); + } + } + + public Task AddOffering(AddOfferingRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingToAdd.OfferingId); + var offeringEntry = Value.CreateOffering(request.OfferingToAdd.OfferingId, request.OfferingToAdd.EntityId, + request.OfferingToAdd.Price, request.OfferingToAdd.RemainingCount); + var status = s.Upsert(ref offeringKey, ref offeringEntry); + if 
(status.IsCanceled) + return Task.FromResult(new AddOfferingResponse + { + Ok = false + }); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new AddOfferingResponse + { + Ok = true + }); + } + finally + { + sessions.Return(s); + } + } +} + +public class FasterKvReservationService : FasterKVReservationService.FasterKVReservationServiceBase +{ + private FasterKvReservationBackgroundService faster; + + public FasterKvReservationService(FasterKvReservationBackgroundService faster) + { + this.faster = faster; + } + + public override Task MakeReservation(ReservationRequest request, ServerCallContext context) + { + return faster.MakeReservation(request); + } + + public override Task CancelReservation(ReservationRequest request, ServerCallContext context) + { + return faster.CancelReservation(request); + } + + public override Task AddOffering(AddOfferingRequest request, ServerCallContext context) + { + return faster.AddOffering(request); + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/spfaster/fasterkv-reservable.proto b/cs/research/darq/ExampleServices/spfaster/fasterkv-reservable.proto new file mode 100644 index 000000000..18924390e --- /dev/null +++ b/cs/research/darq/ExampleServices/spfaster/fasterkv-reservable.proto @@ -0,0 +1,40 @@ +syntax = "proto3"; +option csharp_namespace = "protobuf"; +message Offering { + int64 offeringId = 1; + int64 entityId = 2; + int32 price = 4; + int32 remainingCount = 5; +} + +message Reservation { + int64 reservationId = 1; + int64 offeringId = 2; + int64 customerId = 3; + int32 count = 4; +} + +message ReservationRequest { + int64 reservationId = 1; + int64 offeringId = 2; + int64 customerId = 3; + int32 count = 4; +} + +message ReservationResponse { + bool ok = 1; +} + +message AddOfferingRequest { + Offering offeringToAdd = 1; +} + +message AddOfferingResponse { + bool ok = 
1; +} + +service FasterKVReservationService { + rpc MakeReservation(ReservationRequest) returns (ReservationResponse); + rpc CancelReservation(ReservationRequest) returns (ReservationResponse); + rpc AddOffering(AddOfferingRequest) returns (AddOfferingResponse); +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/splog/SplogService.cs b/cs/research/darq/ExampleServices/splog/SplogService.cs new file mode 100644 index 000000000..a0f2c8ac3 --- /dev/null +++ b/cs/research/darq/ExampleServices/splog/SplogService.cs @@ -0,0 +1,163 @@ +using System.Diagnostics; +using FASTER.core; +using FASTER.libdpr; +using Google.Protobuf; +using Grpc.Core; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using protobuf; +using Status = Grpc.Core.Status; + +namespace dse.services.splog; + +public class SpeculativeLog : StateObject +{ + private FasterLogSettings settings; + public FasterLog log; + + public SpeculativeLog(FasterLogSettings settings, IVersionScheme versionScheme, DprWorkerOptions options) : base(versionScheme, options) + { + this.settings = settings; + log = new FasterLog(settings); + } + + public override void Dispose() + { + log.Dispose(); + } + + public override void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + log.CommitStrongly(out _, out _, false, metadata.ToArray(), version, onPersist); + } + + public override void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + log = new FasterLog(settings); + log.Recover(version); + metadata = log.RecoveredCookie; + } + + public override void PruneVersion(long version) + { + settings.LogCommitManager.RemoveCommit(version); + } + + public override IEnumerable> GetUnprunedVersions() + { + var commits = settings.LogCommitManager.ListCommits().ToList(); + return commits.Select(commitNum => + { + // TODO(Tianyu): hacky + var newLog = new FasterLog(settings); + newLog.Recover(commitNum); + var commitCookie = newLog.RecoveredCookie; + 
newLog.Dispose(); + return new Memory(commitCookie); + }); + } +} + +public class SplogBackgroundService : BackgroundService +{ + private SpeculativeLog backend; + private ILogger logger; + + public SplogBackgroundService(SpeculativeLog backend, ILogger logger) + { + this.backend = backend; + this.logger = logger; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + logger.LogInformation("Splog is starting..."); + backend.ConnectToCluster(out _); + await Task.Delay(Timeout.InfiniteTimeSpan, stoppingToken); + logger.LogInformation("Splog is shutting down"); + } + + public Task Append(SplogAppendRequest request) + { + var lsn = backend.log.Enqueue(request.Entry.Span); + return Task.FromResult(new SplogAppendResponse + { + Ok = true, + Lsn = lsn + }); + } + + private unsafe bool TryReadOneEntry(FasterLogScanIterator iterator, SplogScanResponse response) + { + if (!iterator.UnsafeGetNext(out var entry, out var entryLength, out var lsn, out var nextAddress)) return false; + response.Entries.Add(new SplogEntry + { + Entry = ByteString.CopyFrom(new Span(entry, entryLength)), + Lsn = lsn + }); + response.NextLsn = nextAddress; + iterator.UnsafeRelease(); + return true; + } + + private async Task NextEntryWithTimeOut(FasterLogScanIterator iterator, int timeoutMilli, Stopwatch timer) + { + var currentTime = timer.ElapsedMilliseconds; + if (currentTime > timeoutMilli) return false; + var nextEntry = iterator.WaitAsync().AsTask(); + var session = backend.DetachFromWorkerAndPauseAction(); + var result = await Task.WhenAny(nextEntry, Task.Delay((int)(timeoutMilli - currentTime))); + if (!await backend.TryMergeAndStartActionAsync(session)) + throw new RpcException(Status.DefaultCancelled); + return result == nextEntry; + } + + public async Task Scan(SplogScanRequest request) + { + var responseObject = new SplogScanResponse(); + var scanner = backend.log.Scan(request.StartLsn, request.EndLsn, recover: false, scanUncommitted: true); + var timer 
= Stopwatch.StartNew(); + for (var i = 0; i < request.MaxChunkSize; i++) + { + if (TryReadOneEntry(scanner, responseObject)) continue; + if (!await NextEntryWithTimeOut(scanner, request.TimeoutMilli, timer)) break; + } + + return responseObject; + } + + public Task Truncate(SplogTruncateRequest request) + { + backend.log.TruncateUntil(request.NewStartLsn); + return Task.FromResult(new SplogTruncateResponse + { + Ok = true + }); + } +} + +public class SplogService : protobuf.SplogService.SplogServiceBase +{ + private SplogBackgroundService backend; + + public SplogService(SplogBackgroundService backend) + { + this.backend = backend; + } + + public override Task Append(SplogAppendRequest request, ServerCallContext context) + { + return backend.Append(request); + } + + + public override Task Scan(SplogScanRequest request, ServerCallContext context) + { + return backend.Scan(request); + } + + public override Task Truncate(SplogTruncateRequest request, ServerCallContext context) + { + return backend.Truncate(request); + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/splog/splog.proto b/cs/research/darq/ExampleServices/splog/splog.proto new file mode 100644 index 000000000..66a138d7f --- /dev/null +++ b/cs/research/darq/ExampleServices/splog/splog.proto @@ -0,0 +1,42 @@ +syntax = "proto3"; +option csharp_namespace = "protobuf"; + +message SplogAppendRequest { + bytes entry = 1; +} + +message SplogAppendResponse { + bool ok = 1; + int64 lsn = 2; +} + +message SplogScanRequest { + int64 startLsn = 1; + int64 endLsn = 2; + int32 maxChunkSize = 3; + int32 timeoutMilli = 4; +} + +message SplogEntry { + bytes entry = 1; + int64 lsn = 2; +} + +message SplogScanResponse { + repeated SplogEntry entries = 1; + int64 nextLsn = 2; +} + +message SplogTruncateRequest { + int64 newStartLsn = 1; +} + +message SplogTruncateResponse { + bool ok = 1; +} + +service SplogService { + rpc Append(SplogAppendRequest) returns (SplogAppendResponse); + rpc 
Scan(SplogScanRequest) returns (SplogScanResponse); + rpc Truncate(SplogTruncateRequest) returns (SplogTruncateResponse); +} diff --git a/cs/research/darq/ExampleServices/sppubsub/SpPubSubProcessorClient.cs b/cs/research/darq/ExampleServices/sppubsub/SpPubSubProcessorClient.cs new file mode 100644 index 000000000..d7521e531 --- /dev/null +++ b/cs/research/darq/ExampleServices/sppubsub/SpPubSubProcessorClient.cs @@ -0,0 +1,90 @@ +using FASTER.core; +using FASTER.libdpr; +using pubsub; + +namespace dse.services; + +public interface SpPubSubEventHandler +{ + // Invoked when there are no more entries and the handler is expected to await + ValueTask HandleAwait(); + + ValueTask HandleAsync(Event ev, CancellationToken token); + + void OnRestart(PubsubCapabilities capabilities); +} + +public class PubsubCapabilities +{ + internal SpPubSubServiceClient client; + internal DprSession session; + internal long incarnationId; + internal int topicId; + + public Task Step(pubsub.StepRequest request) + { + request.IncarnationId = incarnationId; + request.TopicId = topicId; + request.FireAndForget = true; + return client.StepAsync(request, session); + } +} + +public class SpPubSubProcessorClient +{ + private int topicId; + private SpPubSubServiceClient client; + private long incarnationId; + + public SpPubSubProcessorClient(int topicId, SpPubSubServiceClient client) + { + this.topicId = topicId; + this.client = client; + } + + public async Task StartProcessingAsync(SpPubSubEventHandler handler, bool speculative, + CancellationToken token = default) + { + incarnationId = await client.RegisterProcessor(topicId); + while (!token.IsCancellationRequested) + { + var session = speculative ? 
new DprSession() : null; + handler.OnRestart(new PubsubCapabilities + { + client = client, + // To ensure that step returns quickly, make the return speculative even if processing is not + session = session, + incarnationId = incarnationId, + topicId = topicId + }); + var stream = client.ReadEventsFromTopic(new ReadEventsRequest + { + Speculative = speculative, + TopicId = topicId + }, session, cancellationToken: token); + + try + { + while (true) + { + var task = stream.ResponseStream.MoveNext(token); + if (!task.IsCompleted) + { + await handler.HandleAwait(); + if (!await task) break; + } + await handler.HandleAsync(stream.ResponseStream.Current, token); + } + } + catch (DprSessionRolledBackException e) + { + // Just continue and restart the stream from where it's supposed to + continue; + } + catch (TaskCanceledException e) + { + break; + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/sppubsub/SpPubSubService.cs b/cs/research/darq/ExampleServices/sppubsub/SpPubSubService.cs new file mode 100644 index 000000000..a529270a5 --- /dev/null +++ b/cs/research/darq/ExampleServices/sppubsub/SpPubSubService.cs @@ -0,0 +1,404 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Text; +using darq.client; +using FASTER.client; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; +using Google.Protobuf; +using Grpc.Core; +using Microsoft.Extensions.Hosting; +using pubsub; +using DarqMessageType = FASTER.libdpr.DarqMessageType; +using DarqStepStatus = pubsub.DarqStepStatus; +using Event = pubsub.Event; +using RegisterProcessorRequest = pubsub.RegisterProcessorRequest; +using RegisterProcessorResult = pubsub.RegisterProcessorResult; +using Status = Grpc.Core.Status; +using StepRequest = pubsub.StepRequest; + +namespace dse.services; + +internal struct EventDataAdapter : ILogEnqueueEntry +{ + internal string data; + public int SerializedLength => sizeof(DarqMessageType) + 
data.Length; + + public unsafe void SerializeTo(Span dest) + { + fixed (byte* h = dest) + { + var head = h; + *(DarqMessageType*)head = DarqMessageType.IN; + head += sizeof(DarqMessageType); + Encoding.UTF8.GetBytes(data, new Span(head, data.Length)); + } + } +} + +public class PubsubDarqProducer : IDarqProducer +{ + private SpPubSubServiceClient client; + private DprSession session; + private SimpleObjectPool requestPool = new(() => new EnqueueRequest()); + private SimpleObjectPool>> callbackPool = new(() => new List>(32)); + private Dictionary>)> currentRequest = new(); + + public PubsubDarqProducer(Dictionary clusterMap, DprSession session) + { + client = new SpPubSubServiceClient(clusterMap); + this.session = session; + } + + public void Dispose() + { + } + + public void EnqueueMessageWithCallback(DarqId darqId, ReadOnlySpan message, Action callback, + long producerId, long lsn) + { + + if (!currentRequest.TryGetValue(darqId, out var entry)) + { + var request = requestPool.Checkout(); + request.TopicId = (int)darqId.guid; + request.ProducerId = producerId; + request.SequenceNum = 0; + request.Events.Clear(); + var callbacks = callbackPool.Checkout(); + callbacks.Clear(); + + entry = currentRequest[darqId] = (request, callbacks); + } + // Only expecting to call with a single producerId for now + Debug.Assert(entry.Item1.ProducerId == producerId); + // Only expecting to get monotonically increasing lsns for now + Debug.Assert(entry.Item1.SequenceNum < lsn); + entry.Item1.SequenceNum = lsn; + entry.Item1.Events.Add(Encoding.UTF8.GetString(message)); + entry.Item2.Add(callback); + } + + public void ForceFlush() + { + foreach (var entry in currentRequest.Values) + { + Task.Run(async () => + { + await client.EnqueueEventsAsync(entry.Item1, session); + foreach (var callback in entry.Item2) callback(true); + requestPool.Return(entry.Item1); + callbackPool.Return(entry.Item2); + }); + } + currentRequest.Clear(); + } +} + +public class SpPubSubServiceSettings +{ + public 
Dictionary clusterMap; + public Func factory; + public int hostId; +} + +public class SpPubSubBackendService : BackgroundService +{ + private SpPubSubServiceSettings settings; + private ConcurrentDictionary topics; + private DarqMaintenanceBackgroundService maintenanceService; + private StateObjectRefreshBackgroundService refreshService; + private TaskCompletionSource started; + + public SpPubSubBackendService(SpPubSubServiceSettings settings, + DarqMaintenanceBackgroundService maintenanceService, + StateObjectRefreshBackgroundService refreshService) + { + this.settings = settings; + topics = new ConcurrentDictionary(); + this.maintenanceService = maintenanceService; + this.refreshService = refreshService; + started = new TaskCompletionSource(); + } + + public async ValueTask GetTopic(int id) + { + await started.Task; + return topics[id]; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + foreach (var entry in settings.clusterMap) + { + if (entry.Value.Item1 != settings.hostId) continue; + var result = settings.factory(entry.Key, new DprWorkerId(entry.Key)); + result.ConnectToCluster(out _); + maintenanceService.RegisterMaintenanceTask(result, new DarqMaintenanceBackgroundServiceSettings + { + morselSize = 512, + batchSize = 64, + producerFactory = session => new PubsubDarqProducer(settings.clusterMap, session), + speculative = true + }); + refreshService.RegisterRefreshTask(result); + topics[entry.Key] = result; + } + + started.SetResult(); + await Task.Delay(Timeout.InfiniteTimeSpan, stoppingToken); + } +} + +public class SpPubSubService : SpPubSub.SpPubSubBase +{ + private SimpleObjectPool stepRequestPool; + private SpPubSubBackendService backend; + + + public SpPubSubService(SpPubSubBackendService backend) + { + this.backend = backend; + stepRequestPool = new SimpleObjectPool(() => new FASTER.libdpr.StepRequest()); + } + + // private async Task GetOrCreateTopic(int topicId) + // { + // if (topics.TryGetValue(topicId, out 
var entry)) return entry; + // // Otherwise, check if topic has been created in manager + // var queryResult = await consul.KV.Get("topic-" + topicId); + // if (queryResult.Response == null) + // throw new RpcException(new Status(StatusCode.NotFound, "requested topic does not exist")); + // + // var metadataEntry = JObject.Parse(Encoding.UTF8.GetString(queryResult.Response.Value)); + // + // if (((string)metadataEntry["hostId"])!.Equals(hostId)) + // throw new RpcException(new Status(StatusCode.NotFound, "requested topic is not assigned to this host")); + // + // lock (this) + // { + // if (topics.TryGetValue(topicId, out entry)) return entry; + // var dprWorkerId = new DprWorkerId((long)metadataEntry["dprWorkerId"]!); + // var result = factory(topicId, dprWorkerId); + // result.ConnectToCluster(out _); + // maintenanceService.RegisterMaintenanceTask(result, new DarqMaintenanceBackgroundServiceSettings + // { + // morselSize = 512, + // batchSize = 16, + // producerFactory = session => new PubsubDarqProducer(session, new ConsulClient(settings.consulConfig)) + // }); + // refreshService.RegisterRefreshTask(result); + // topics[topicId] = result; + // return result; + // } + // } + + private ByteString PopulateHeaderAndEndAction(Darq topic) + { + unsafe + { + var dprHeaderBytes = stackalloc byte[DprMessageHeader.FixedLenSize]; + topic.ProduceTagAndEndAction(new Span(dprHeaderBytes, DprMessageHeader.FixedLenSize)); + return ByteString.CopyFrom(new Span(dprHeaderBytes, DprMessageHeader.FixedLenSize)); + } + } + + public override async Task EnqueueEvents(EnqueueRequest request, ServerCallContext context) + { + // TODO(Tianyu): Create Epoch Context + var topic = await backend.GetTopic(request.TopicId); + if (!request.DprHeader.IsEmpty) + { + // Speculative code path + if (!await topic.TryReceiveAndStartActionAsync(request.DprHeader)) + // Use an error to signal to caller that this call cannot proceed + // TODO(Tianyu): add more descriptive exception information + throw 
new RpcException(Status.DefaultCancelled); + var result = new EnqueueResult + { + Ok = topic.Enqueue(request.Events.Select(e => new EventDataAdapter { data = e }), + request.ProducerId, request.SequenceNum) + }; + result.DprHeader = PopulateHeaderAndEndAction(topic); + + return result; + } + else + { + topic.StartLocalAction(); + var result = new EnqueueResult + { + Ok = topic.Enqueue(request.Events.Select(e => new EventDataAdapter { data = e }), + request.ProducerId, request.SequenceNum) + }; + topic.EndAction(); + if (!request.FireAndForget) + await topic.NextCommit(); + return result; + } + } + + public override async Task Step(StepRequest request, ServerCallContext context) + { + var topic = await backend.GetTopic(request.TopicId); + // TODO(Tianyu): Pick the appropriate context + LightEpoch.EpochContext epochContext = null; + + var requestObject = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(requestObject); + foreach (var consumed in request.ConsumedMessageOffsets) + requestBuilder.MarkMessageConsumed(consumed); + foreach (var self in request.RecoveryMessages) + requestBuilder.AddRecoveryMessage(self.Span); + foreach (var outBatch in request.OutMessages) + { + if (outBatch.TopicId == request.TopicId) + requestBuilder.AddSelfMessage(outBatch.Event); + else + requestBuilder.AddOutMessage(new DarqId(outBatch.TopicId), outBatch.Event); + } + + if (!request.DprHeader.IsEmpty) + { + // Speculative code path + if (!await topic.TryReceiveAndStartActionAsync(request.DprHeader, epochContext)) + // Use an error to signal to caller that this call cannot proceed + // TODO(Tianyu): add more descriptive exception information + throw new RpcException(Status.DefaultCancelled); + var status = topic.Step(request.IncarnationId, requestBuilder.FinishStep()); + var result = new StepResult + { + Status = status switch + { + // Should never happen + StepStatus.INCOMPLETE => throw new NotImplementedException(), + StepStatus.SUCCESS => 
DarqStepStatus.Success, + StepStatus.INVALID => DarqStepStatus.Invalid, + StepStatus.REINCARNATED => DarqStepStatus.Reincarnated, + _ => throw new ArgumentOutOfRangeException() + } + }; + result.DprHeader = PopulateHeaderAndEndAction(topic); + stepRequestPool.Return(requestObject); + return result; + } + else + { + topic.StartLocalAction(epochContext); + var status = topic.Step(request.IncarnationId, requestBuilder.FinishStep()); + var result = new StepResult + { + Status = status switch + { + // Should never happen + StepStatus.INCOMPLETE => throw new NotImplementedException(), + StepStatus.SUCCESS => DarqStepStatus.Success, + StepStatus.INVALID => DarqStepStatus.Invalid, + StepStatus.REINCARNATED => DarqStepStatus.Reincarnated, + _ => throw new ArgumentOutOfRangeException() + } + }; + topic.EndAction(epochContext); + stepRequestPool.Return(requestObject); + if (!request.FireAndForget) + await topic.NextCommit(); + return result; + } + } + + + private unsafe bool TryReadOneEntry(Darq topic, long worldLine, DarqScanIterator scanner, + LightEpoch.EpochContext context, out Event ev) + { + ev = default; + var dprHeaderBytes = stackalloc byte[DprMessageHeader.FixedLenSize]; + try + { + topic.StartLocalAction(context); + if (topic.WorldLine() != worldLine) + throw new DprSessionRolledBackException(topic.WorldLine()); + + if (!scanner.UnsafeGetNext(out var b, out var length, out var offset, out var nextOffset, out var type)) + return false; + + if (type is not (DarqMessageType.IN or DarqMessageType.RECOVERY)) + { + scanner.UnsafeRelease(); + return false; + } + + ev = new Event + { + Type = type switch + { + DarqMessageType.IN => pubsub.DarqMessageType.In, + DarqMessageType.RECOVERY => pubsub.DarqMessageType.Recovery, + _ => throw new ArgumentOutOfRangeException() + }, + Data = Encoding.UTF8.GetString(b, length), + Offset = offset, + NextOffset = nextOffset + }; + scanner.UnsafeRelease(); + return true; + } + finally + { + topic.ProduceTagAndEndAction(new 
Span(dprHeaderBytes, DprMessageHeader.FixedLenSize), context); + if (ev != default) + ev.DprHeader = ByteString.CopyFrom(new Span(dprHeaderBytes, DprMessageHeader.FixedLenSize)); + } + } + + public override async Task ReadEventsFromTopic(ReadEventsRequest request, IServerStreamWriter responseStream, + ServerCallContext context) + { + var topic = await backend.GetTopic(request.TopicId); + var worldLine = topic.WorldLine(); + var scanner = topic.StartScan(request.Speculative); + + // TODO(Tianyu): Pick the appropriate context + LightEpoch.EpochContext epochContext = null; + long lastCommitted = 0; + + while (!context.CancellationToken.IsCancellationRequested) + { + if (TryReadOneEntry(topic, worldLine, scanner, epochContext, out var ev)) + { + if (!request.Speculative && ev.NextOffset >= lastCommitted) + { + // Avoid repeatedly wait for the newest commit + lastCommitted = topic.Tail; + await topic.NextCommit(); + } + await responseStream.WriteAsync(ev); + } + else + { + await scanner.WaitAsync(context.CancellationToken); + } + } + } + + public override async Task RegisterProcessor(RegisterProcessorRequest request, + ServerCallContext context) + { + var topic = await backend.GetTopic(request.TopicId); + var result = await topic.RegisterNewProcessorAsync(); + return new RegisterProcessorResult + { + IncarnationId = result + }; + } + + public override async Task GetNumBytesWritten(GetNumBytesWrittenRequest request, ServerCallContext context) + { + var topic = await backend.GetTopic(request.TopicId); + return new GetNumBytesWrittenResult + { + NumBytes = topic.BytesWritten + }; + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/sppubsub/SpPubSubServiceClient.cs b/cs/research/darq/ExampleServices/sppubsub/SpPubSubServiceClient.cs new file mode 100644 index 000000000..9ac8ba0f7 --- /dev/null +++ b/cs/research/darq/ExampleServices/sppubsub/SpPubSubServiceClient.cs @@ -0,0 +1,175 @@ +using System.Collections.Concurrent; +using 
System.Diagnostics; +using FASTER.common; +using FASTER.libdpr; +using Google.Protobuf; +using Grpc.Core; +using Grpc.Core.Interceptors; +using Grpc.Net.Client; +using pubsub; +using StepRequest = pubsub.StepRequest; + +namespace dse.services; + +public class SpPubSubServiceClient +{ + private Dictionary clusterMap; + private ConcurrentDictionary openConnections = new(); + private SimpleObjectPool serializationBufferPool = new(() => new byte[1 << 20]); + + public SpPubSubServiceClient(Dictionary clusterMap) + { + this.clusterMap = clusterMap; + } + + private ValueTask GetOrCreateConnection(int topicId) + { + if (openConnections.TryGetValue(topicId, out var result)) return ValueTask.FromResult(result); + return ValueTask.FromResult(openConnections[topicId] = GrpcChannel.ForAddress(clusterMap[topicId].Item2)); + // var queryResult = await consul.KV.Get("topic-" + topicId); + // if (queryResult.Response == null) + // throw new NotImplementedException("Topic does not exist"); + // + // var metadataEntry = JObject.Parse(Encoding.UTF8.GetString(queryResult.Response.Value)); + // return openConnections[topicId] = GrpcChannel.ForAddress((string) metadataEntry["hostAddress"]); + } + + // public async Task CreateTopic(int topicId, string hostId, string hostAddress, DprWorkerId id) + // { + // var jsonEntry = JsonConvert.SerializeObject(new + // { hostId = hostId, hostAddress = hostAddress, dprWorkerId = id.guid }); + // return (await consul.KV.CAS(new KVPair("topic-" + topicId) + // { + // Value = Encoding.UTF8.GetBytes(jsonEntry) + // })).Response; + // } + + public async Task EnqueueEventsAsync(EnqueueRequest request, DprSession session = null) + { + var channel = await GetOrCreateConnection(request.TopicId); + if (session != null) + { + var buf = serializationBufferPool.Checkout(); + var size = session.TagMessage(buf); + request.DprHeader = ByteString.CopyFrom(new Span(buf, 0, size)); + serializationBufferPool.Return(buf); + } + + var client = new 
SpPubSub.SpPubSubClient(channel); + var result = await client.EnqueueEventsAsync(request); + if (session == null || session.Receive(result.DprHeader.Span)) + return result; + throw new TaskCanceledException(); + } + + public async Task RegisterProcessor(int topicId) + { + var channel = await GetOrCreateConnection(topicId); + var client = new SpPubSub.SpPubSubClient(channel); + var result = await client.RegisterProcessorAsync(new RegisterProcessorRequest + { + TopicId = topicId + }); + return result.IncarnationId; + } + + public async Task GetNumBytesWritten(int topicId) + { + var channel = await GetOrCreateConnection(topicId); + var client = new SpPubSub.SpPubSubClient(channel); + return await client.GetNumBytesWrittenAsync(new GetNumBytesWrittenRequest + { + TopicId = topicId + }); + } + + public async Task StepAsync(StepRequest request, DprSession session = null) + { + var channel = await GetOrCreateConnection(request.TopicId); + if (session != null) + { + var buf = serializationBufferPool.Checkout(); + var size = session.TagMessage(buf); + request.DprHeader = ByteString.CopyFrom(new Span(buf, 0, size)); + serializationBufferPool.Return(buf); + } + + var client = new SpPubSub.SpPubSubClient(channel); + var result = await client.StepAsync(request); + if (session == null || session.Receive(result.DprHeader.Span)) + return result.Status; + throw new TaskCanceledException(); + } + + private class StreamingCallInterceptor : Interceptor + { + private DprSession session; + + public StreamingCallInterceptor(DprSession session) + { + this.session = session; + } + + public override AsyncServerStreamingCall AsyncServerStreamingCall( + TRequest request, + ClientInterceptorContext context, + AsyncServerStreamingCallContinuation continuation) + { + var originalCall = continuation(request, context); + + var responseStream = new DprHandlingStream(originalCall.ResponseStream, session); + + // Return a new AsyncServerStreamingCall with our custom response stream + return new 
AsyncServerStreamingCall( + responseStream, + originalCall.ResponseHeadersAsync, + originalCall.GetStatus, + originalCall.GetTrailers, + originalCall.Dispose); + } + } + + public class DprHandlingStream : IAsyncStreamReader + { + private readonly IAsyncStreamReader inner; + private readonly DprSession session; + + public DprHandlingStream(IAsyncStreamReader inner, DprSession session) + { + this.inner = inner; + this.session = session; + } + + public T Current => inner.Current; + + public async Task MoveNext(CancellationToken cancellationToken) + { + var hasNext = await inner.MoveNext(cancellationToken); + if (!hasNext) return false; + + var batch = inner.Current as pubsub.Event; + Debug.Assert(batch != null); + if (!session.Receive(batch.DprHeader.Span)) + throw new TaskCanceledException(); + return true; + } + } + + public AsyncServerStreamingCall ReadEventsFromTopic(ReadEventsRequest request, + DprSession session = null, DateTime? deadline = null, CancellationToken cancellationToken = default) + { + var channel = GetOrCreateConnection(request.TopicId).GetAwaiter().GetResult(); + if (session == null) + { + var client = new SpPubSub.SpPubSubClient(channel); + request.Speculative = false; + return client.ReadEventsFromTopic(request, null, deadline, cancellationToken); + } + else + { + var client = new SpPubSub.SpPubSubClient(channel.Intercept(new StreamingCallInterceptor(session))); + request.Speculative = true; + return client.ReadEventsFromTopic(request, null, deadline, cancellationToken); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/sppubsub/sppubsub.proto b/cs/research/darq/ExampleServices/sppubsub/sppubsub.proto new file mode 100644 index 000000000..a659a5e8e --- /dev/null +++ b/cs/research/darq/ExampleServices/sppubsub/sppubsub.proto @@ -0,0 +1,88 @@ +syntax = "proto3"; +option csharp_namespace = "pubsub"; + +message RegisterProcessorRequest { + int32 topicId = 1; +} + +message RegisterProcessorResult { + int64 
incarnationId = 1; +} + +enum DarqMessageType { + IN = 0; + RECOVERY = 1; +} + +message OutMessage { + int32 topicId = 1; + string event = 2; +} + +message Event { + bytes dprHeader = 1; + DarqMessageType type = 2; + string data = 3; + int64 offset = 5; + int64 nextOffset = 6; +} + +message StepRequest { + bytes dprHeader = 1; + int64 incarnationId = 2; + int32 topicId = 3; + repeated int64 consumedMessageOffsets = 4; + repeated OutMessage outMessages = 5; + repeated bytes recoveryMessages = 6; + bool fireAndForget = 7; +} + +enum DarqStepStatus { + SUCCESS = 0; + INVALID = 1; + REINCARNATED = 2; +} + +message StepResult { + bytes dprHeader = 1; + DarqStepStatus status = 2; +} + +message EnqueueRequest { + bytes dprHeader = 1; + int64 producerId = 2; + int64 sequenceNum = 3; + int32 topicId = 4; + repeated string events = 5; + bool fireAndForget = 6; +} + +message EnqueueResult { + bytes dprHeader = 1; + bool ok = 2; +} + +message ReadEventsRequest { + bool speculative = 1; + int32 topicId = 2; +} + +message GetNumBytesWrittenRequest{ + int32 topicId = 1; +} + +message GetNumBytesWrittenResult { + int64 numBytes = 1; +} + +service SpPubSub { + rpc EnqueueEvents(EnqueueRequest) returns (EnqueueResult); + + rpc ReadEventsFromTopic(ReadEventsRequest) returns (stream Event); + + rpc RegisterProcessor(RegisterProcessorRequest) returns (RegisterProcessorResult); + + rpc Step(StepRequest) returns (StepResult); + + rpc GetNumBytesWritten(GetNumBytesWrittenRequest) returns (GetNumBytesWrittenResult); +} diff --git a/cs/research/darq/ExampleServices/spworkflow/IWorkflowStateMachine.cs b/cs/research/darq/ExampleServices/spworkflow/IWorkflowStateMachine.cs new file mode 100644 index 000000000..8aaced0b2 --- /dev/null +++ b/cs/research/darq/ExampleServices/spworkflow/IWorkflowStateMachine.cs @@ -0,0 +1,13 @@ +using FASTER.libdpr; + +namespace dse.services; + + +public interface IWorkflowStateMachine +{ + public void ProcessMessage(DarqMessage m); + + public void 
OnRestart(IDarqProcessorClientCapabilities capabilities, StateObject stateObject); + + Task GetResult(CancellationToken token); +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/spworkflow/OrchestratorBackgroundService.cs b/cs/research/darq/ExampleServices/spworkflow/OrchestratorBackgroundService.cs new file mode 100644 index 000000000..00eb09941 --- /dev/null +++ b/cs/research/darq/ExampleServices/spworkflow/OrchestratorBackgroundService.cs @@ -0,0 +1,117 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; +using Google.Protobuf; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; + +namespace dse.services; + +public class OrchestartorBackgroundProcessingServiceSettings +{ + public Dictionary workflowFactories; + public bool speculative; +} + +public class OrchestratorBackgroundProcessingService : BackgroundService, IDarqProcessor +{ + private Darq backend; + private ColocatedDarqProcessorClient processorClient; + private Dictionary workflowFactories; + + private ConcurrentDictionary startedWorkflows = new(); + private IDarqProcessorClientCapabilities capabilities; + private SimpleObjectPool stepRequestPool = new(() => new StepRequest()); + private ILogger logger; + private CancellationTokenSource cts; + + public delegate IWorkflowStateMachine WorkflowFactory(ReadOnlySpan input, ILogger logger); + + + public OrchestratorBackgroundProcessingService(Darq darq, OrchestartorBackgroundProcessingServiceSettings settings, ILogger logger) + { + backend = darq; + processorClient = new ColocatedDarqProcessorClient(backend, settings.speculative); + workflowFactories = settings.workflowFactories; + this.logger = logger; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + backend.ConnectToCluster(out _); + await processorClient.StartProcessingAsync(this, stoppingToken); + processorClient.Dispose(); + } + + public 
async Task CreateWorkflow(ExecuteWorkflowRequest request) + { + var workflowHandler = workflowFactories[request.WorkflowClassId](request.Input.Span, logger); + workflowHandler.OnRestart(capabilities, backend); + var actualHandler = startedWorkflows.GetOrAdd(request.WorkflowId, workflowHandler); + if (actualHandler == workflowHandler) + { + // This handle was created by this thread, which gives us the ability to go ahead and start the workflow + var stepRequest = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(stepRequest); + requestBuilder.AddRecoveryMessage(-request.WorkflowId, request.ToByteArray()); + // Start the workflow by giving it an initial message + requestBuilder.AddSelfMessage(request.WorkflowId, Span.Empty); + var success = backend.Enqueue(requestBuilder.FinishStep(), -1, 0); + Debug.Assert(success); + // logger.LogInformation($"Workflow {request.WorkflowId} started"); + stepRequestPool.Return(stepRequest); + } + return await GetWorkflowResultAsync(request.WorkflowId, actualHandler); + } + + private async Task GetWorkflowResultAsync(long workflowId, IWorkflowStateMachine workflow) + { + while (true) + { + var s = backend.DetachFromWorkerAndPauseAction(); + try + { + var result = await workflow.GetResult(cts.Token); + if (await backend.TryMergeAndStartActionAsync(s)) return result; + } + catch (TaskCanceledException) + { + } + + // Otherwise, there has been a rollback, should retry with a new handle, if any + while (!startedWorkflows.TryGetValue(workflowId, out workflow)) + await Task.Yield(); + backend.StartLocalAction(); + } + } + + public bool ProcessMessage(DarqMessage m) + { + var workflowId = BitConverter.ToInt64(m.GetMessageBody()); + if (workflowId < 0) + { + logger.LogWarning($"Replaying Workflow creation for id {-workflowId}"); + Debug.Assert(m.GetMessageType() == DarqMessageType.RECOVERY); + var request = ExecuteWorkflowRequest.Parser.ParseFrom(m.GetMessageBody()); + var workflow = 
workflowFactories[request.WorkflowClassId](request.Input.Span, logger); + workflow.OnRestart(capabilities, backend); + var ok = startedWorkflows.TryAdd(-workflowId, workflow); + Debug.Assert(ok); + return true; + } + + startedWorkflows[workflowId].ProcessMessage(m); + return true; + } + + public void OnRestart(IDarqProcessorClientCapabilities capabilities) + { + logger.LogWarning($"Workflow processor restarted"); + cts?.Cancel(); + startedWorkflows = new ConcurrentDictionary(); + this.capabilities = capabilities; + cts = new CancellationTokenSource(); + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/spworkflow/WorkflowOrchestratorService.cs b/cs/research/darq/ExampleServices/spworkflow/WorkflowOrchestratorService.cs new file mode 100644 index 000000000..1198374af --- /dev/null +++ b/cs/research/darq/ExampleServices/spworkflow/WorkflowOrchestratorService.cs @@ -0,0 +1,24 @@ +using System.Text; +using Grpc.Core; +using Microsoft.Extensions.Logging; + +namespace dse.services; + +public class WorkflowOrchestratorService : WorkflowOrchestrator.WorkflowOrchestratorBase +{ + private OrchestratorBackgroundProcessingService backend; + private ILogger logger; + public WorkflowOrchestratorService(OrchestratorBackgroundProcessingService backend, ILogger logger) + { + this.backend = backend; + this.logger = logger; + } + + public override Task ExecuteWorkflow(ExecuteWorkflowRequest request, + ServerCallContext context) + { + // logger.LogInformation( + // $"Received execute workflow request, id of {request.WorkflowId}, class id of {request.WorkflowClassId}, request string of {Encoding.UTF8.GetString(request.Input.Span)}"); + return backend.CreateWorkflow(request); + } +} \ No newline at end of file diff --git a/cs/research/darq/ExampleServices/spworkflow/workflow.proto b/cs/research/darq/ExampleServices/spworkflow/workflow.proto new file mode 100644 index 000000000..9e07853a8 --- /dev/null +++ 
b/cs/research/darq/ExampleServices/spworkflow/workflow.proto @@ -0,0 +1,16 @@ +syntax = "proto3"; + +service WorkflowOrchestrator { + rpc ExecuteWorkflow(ExecuteWorkflowRequest) returns (ExecuteWorkflowResult); +} + +message ExecuteWorkflowRequest { + int64 workflowId = 1; + int32 workflowClassId = 2; + bytes input = 3; +} + +message ExecuteWorkflowResult { + bool ok = 1; + bytes result = 2; +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq.test/FASTER.darq.test.csproj b/cs/research/darq/FASTER.darq.test/FASTER.darq.test.csproj new file mode 100644 index 000000000..ca8586963 --- /dev/null +++ b/cs/research/darq/FASTER.darq.test/FASTER.darq.test.csproj @@ -0,0 +1,19 @@ + + + + net7.0 + enable + enable + + false + + + + + + + + + + + diff --git a/cs/research/darq/FASTER.darq.test/UnitTest1.cs b/cs/research/darq/FASTER.darq.test/UnitTest1.cs new file mode 100644 index 000000000..11c7bf287 --- /dev/null +++ b/cs/research/darq/FASTER.darq.test/UnitTest1.cs @@ -0,0 +1,15 @@ +namespace FASTER.darq.test; + +public class Tests +{ + [SetUp] + public void Setup() + { + } + + [Test] + public void Test1() + { + Assert.Pass(); + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq.test/Usings.cs b/cs/research/darq/FASTER.darq.test/Usings.cs new file mode 100644 index 000000000..cefced496 --- /dev/null +++ b/cs/research/darq/FASTER.darq.test/Usings.cs @@ -0,0 +1 @@ +global using NUnit.Framework; \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/Darq.cs b/cs/research/darq/FASTER.darq/Darq.cs new file mode 100644 index 000000000..81c1a7148 --- /dev/null +++ b/cs/research/darq/FASTER.darq/Darq.cs @@ -0,0 +1,402 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using FASTER.common; +using FASTER.core; +using FASTER.libdpr; + +namespace FASTER.darq +{ + /// + /// Status of a step + /// + public enum StepStatus + { + /// + /// Step is not yet completed + /// + INCOMPLETE, + + /// + /// Step is 
successfully completed + /// + SUCCESS, + + /// + /// Step cannot be completed because it is either ill-formed or because it is trying to consume + /// consumed messages + /// + INVALID, + + /// + /// The step cannot be completed because it originated from a processor that is no longer allowed to + /// update DARQ state (possibly due to another, newer processor taking over) + /// + REINCARNATED + } + + internal class StepRequestHandle + { + internal volatile StepStatus status; + internal long incarnation; + internal IReadOnlySpanBatch stepMessages; + internal ManualResetEventSlim done = new(); + + internal void Reset(long incarnation, IReadOnlySpanBatch stepMessages) + { + this.incarnation = incarnation; + status = StepStatus.INCOMPLETE; + this.stepMessages = stepMessages; + done.Reset(); + } + } + + public class LongValueAttachment : IStateObjectAttachment + { + public long value; + + public int SerializedSize() => sizeof(long); + + public void SerializeTo(Span buffer) + { + BitConverter.TryWriteBytes(buffer, value); + } + + public void RecoverFrom(ReadOnlySpan serialized) + { + unsafe + { + fixed (byte* b = serialized) + value = *(long*)b; + } + } + } + + /// + /// DARQ data structure + /// + public class Darq : StateObject, IDisposable + { + internal DarqSettings settings; + internal FasterLog log; + internal ConcurrentDictionary incompleteMessages = new(); + private FasterLogSettings logSetting; + + private readonly DeduplicationVector dvc; + private readonly LongValueAttachment incarnation, largestSteppedLsn; + private WorkQueueLIFO stepQueue; + private ThreadLocalObjectPool stepRequestPool; + + public long BytesWritten => bytesWritten; + private long bytesWritten = 0, lastCommitted = 0; + + /// + /// Initialize DARQ with the given identity and parameters + /// + /// unique identity for this DARQ + /// parameters for DARQ + public Darq(DarqSettings settings, IVersionScheme versionScheme) : base(versionScheme, new DprWorkerOptions + { + Me = settings.MyDpr == 
DprWorkerId.INVALID ? new DprWorkerId(settings.Me.guid) : settings.MyDpr, + DprFinder = settings.DprFinder, + CheckpointPeriodMilli = settings.CheckpointPeriodMilli, + RefreshPeriodMilli = settings.RefreshPeriodMilli + }) + { + this.settings = settings; + if (settings.LogDevice == null) + throw new FasterException("Cannot initialize DARQ as no underlying device is specified. " + + "Please supply DARQ with a device under DarqSettings.LogDevice"); + + if (settings.LogCommitManager == null) + { + settings.LogCommitManager = new DeviceLogCommitCheckpointManager + (new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme( + settings.LogCommitDir ?? + new FileInfo(settings.LogDevice.FileName).Directory.FullName)); + } + + if (this.settings.CleanStart) + { + settings.LogCommitManager.RemoveAllCommits(); + } + + logSetting = new FasterLogSettings + { + LogDevice = settings.LogDevice, + PageSize = settings.PageSize, + MemorySize = settings.MemorySize, + SegmentSize = settings.SegmentSize, + LogCommitManager = settings.LogCommitManager, + LogCommitDir = settings.LogCommitDir, + GetMemory = _ => + throw new FasterException( + "DARQ should never do anything through a code path that needs to materialize into external mem buffer"), + LogChecksum = settings.LogChecksum, + MutableFraction = settings.MutableFraction, + ReadOnlyMode = false, + FastCommitMode = settings.FastCommitMode, + RemoveOutdatedCommits = false, + LogCommitPolicy = null, + TryRecoverLatest = false, + AutoRefreshSafeTailAddress = true, + AutoCommit = false, + TolerateDeviceFailure = false, + }; + + log = new FasterLog(logSetting); + dvc = new DeduplicationVector(); + incarnation = new LongValueAttachment(); + largestSteppedLsn = new LongValueAttachment(); + AddAttachment(dvc); + AddAttachment(incarnation); + AddAttachment(largestSteppedLsn); + + stepQueue = new WorkQueueLIFO(StepSequential); + stepRequestPool = new ThreadLocalObjectPool(() => new StepRequestHandle()); + } + + /// + /// Return 
the tail address that this DARQ will need to replay to upon failure recovery + /// + public long ReplayEnd => largestSteppedLsn.value; + + public long Tail => log.TailAddress; + + /// + public override void Dispose() + { + if (settings.DeleteOnClose) + settings.LogCommitManager.RemoveAllCommits(); + log.Dispose(); + settings.LogDevice.Dispose(); + settings.LogCommitManager.Dispose(); + } + + private void EnqueueCallbackBatch(IReadOnlySpanBatch m, int idx, long addr) + { + incompleteMessages.TryAdd(addr, 0); + } + + private void EnqueueCallback(T entry, long addr) where T : ILogEnqueueEntry + { + incompleteMessages.TryAdd(addr, 0); + } + + + /// + /// Enqueue given entries into DARQ, optionally deduplicated using the supplied producer ID and sequence number. + /// + /// + /// Entries to enqueue. must already be well-formed on a byte level with message types, etc. + /// + /// Unique id of the producer for deduplication, or -1 if not required + /// + /// sequence number for deduplication. DARQ will only accept enqueue requests with monotonically increasing + /// sequence numbers from the same producer + /// + /// whether enqueue is successful + public bool Enqueue(IReadOnlySpanBatch entries, long producerId, long sequenceNum) + { + // Check that we are not executing duplicates and update dvc accordingly + if (producerId != -1 && !dvc.Process(producerId, sequenceNum)) + return false; + + log.Enqueue(entries, EnqueueCallbackBatch); + return true; + } + + public bool Enqueue(IEnumerable entries, long producerId, long sequenceNum) where T : ILogEnqueueEntry + { + // Check that we are not executing duplicates and update dvc accordingly + if (producerId != -1 && !dvc.Process(producerId, sequenceNum)) + return false; + foreach (var e in entries) + log.Enqueue(e, EnqueueCallback); + return true; + } + + private void StepCallback(IReadOnlySpanBatch ms, int idx, long addr) + { + var entry = ms.Get(idx); + // Get first byte for type + if ((DarqMessageType)entry[0] == 
DarqMessageType.RECOVERY || + (DarqMessageType)entry[0] == DarqMessageType.IN) + incompleteMessages.TryAdd(addr, 0); + + largestSteppedLsn.value = addr; + } + + private unsafe void StepSequential(StepRequestHandle stepRequestHandle) + { + // Maintain incarnation number + if (stepRequestHandle.incarnation != incarnation.value) + { + stepRequestHandle.status = StepStatus.REINCARNATED; + stepRequestHandle.done.Set(); + return; + } + + Debug.Assert(incarnation.value == stepRequestHandle.incarnation); + + // Validation of input batch + var numTotalEntries = stepRequestHandle.stepMessages.TotalEntries(); + // Validate if The last entry of the step is a completion record that steps some previous message + var lastEntry = stepRequestHandle.stepMessages.Get(numTotalEntries - 1); + fixed (byte* h = lastEntry) + { + var end = h + lastEntry.Length; + var messageType = (DarqMessageType)(*h); + if (messageType == DarqMessageType.COMPLETION) + { + Debug.Assert(lastEntry.Length % sizeof(long) == 1); + for (var head = h + sizeof(DarqMessageType); head < end; head += sizeof(long)) + { + var completedLsn = *(long*)head; + if (!incompleteMessages.TryRemove(completedLsn, out _)) + { + // This means we are trying to step something twice. Roll back all previous steps before + // failing this step + for (var rollbackHead = h + sizeof(DarqMessageType); + rollbackHead < head; + rollbackHead += sizeof(long)) + incompleteMessages.TryAdd(*(long*)rollbackHead, 0); + stepRequestHandle.status = StepStatus.INVALID; + stepRequestHandle.done.Set(); + Console.WriteLine($"step failed on lsn {completedLsn}"); + return; + } + } + } + } + log.Enqueue(stepRequestHandle.stepMessages, StepCallback); + stepRequestHandle.done.Set(); + stepRequestHandle.status = StepStatus.SUCCESS; + } + + /// + /// Step the DARQ with given incarnation number and step content + /// + /// incarnation number of the originating processor + /// + /// Step content. 
must already be well-formed on a byte level with message + /// types, etc. with the last entry being a completion record + /// + /// step result + public StepStatus Step(long incarnation, IReadOnlySpanBatch stepMessages) + { + var request = stepRequestPool.Checkout(); + request.Reset(incarnation, stepMessages); + stepQueue.EnqueueAndTryWork(request, false); + while (request.status == StepStatus.INCOMPLETE) + request.done.Wait(); + var result = request.status; + stepRequestPool.Return(request); + return result; + } + + /// + /// Truncate DARQ until the given lsn + /// + /// truncation point + public void TruncateUntil(long lsn) + { + log.TruncateUntil(lsn); + } + + /// + /// Registers a new processor the submit steps to this DARQ. + /// + /// the unique incarnation number assigned to this processor + public long RegisterNewProcessor() + { + return RegisterNewProcessorAsync().GetAwaiter().GetResult(); + } + + public Task RegisterNewProcessorAsync() + { + var result = Interlocked.Increment(ref incarnation.value); + // TODO(Tianyu): Must use some sort of epoch to force all threads to synchronize and recognize the new + // incarnation? On the other hand, maybe the sequential processing of steps is sufficient guarantee... 
+ // ForceCheckpoint(spin: true); + return Task.FromResult(result); + } + + /// + /// Scans the DARQ with an iterator + /// + /// + public DarqScanIterator StartScan(bool speculative) => new(log, largestSteppedLsn.value, speculative); + + public DarqScanIterator StartBackgroundScan(bool speculative) => new(log, 0, speculative, false); + + + public override void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + var commitCookie = metadata.ToArray(); + log.CommitStrongly(out var tail, out _, false, commitCookie, version, onPersist); + bytesWritten += tail - Math.Max(lastCommitted, log.BeginAddress); + lastCommitted = tail; + } + + public override void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + Console.WriteLine($"Restoring checkpoint {version}"); + incompleteMessages.Clear(); + + // TODO(Tianyu): can presumably be more efficient through some type of in-mem truncation here + log = new FasterLog(logSetting); + log.Recover(version); + metadata = log.RecoveredCookie; + + Console.WriteLine($"Log recovered, now restoring in-memory DARQ data structures"); + // Scan the log on recovery to repopulate in-memory auxiliary data structures + unsafe + { + using var it = log.Scan(0, long.MaxValue); + while (it.UnsafeGetNext(out byte* entry, out var len, out var lsn, out _)) + { + switch ((DarqMessageType)(*entry)) + { + case DarqMessageType.IN: + case DarqMessageType.RECOVERY: + incompleteMessages.TryAdd(lsn, 0); + break; + case DarqMessageType.COMPLETION: + var completed = (long*)(entry + sizeof(DarqMessageType)); + while (completed < entry + len) + incompleteMessages.TryRemove(*completed++, out _); + break; + case DarqMessageType.OUT: + break; + default: + throw new NotImplementedException(); + } + + it.UnsafeRelease(); + } + } + + Console.WriteLine($"Recovery Finished"); + } + + public override void PruneVersion(long version) + { + settings.LogCommitManager.RemoveCommit(version); + } + + public override IEnumerable> 
GetUnprunedVersions() + { + var commits = settings.LogCommitManager.ListCommits().ToList(); + return commits.Select(commitNum => + { + // TODO(Tianyu): hacky + var newLog = new FasterLog(logSetting); + newLog.Recover(commitNum); + var commitCookie = newLog.RecoveredCookie; + newLog.Dispose(); + return new Memory(commitCookie); + }); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqBackgroundService.cs b/cs/research/darq/FASTER.darq/DarqBackgroundService.cs new file mode 100644 index 000000000..3a99a9527 --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqBackgroundService.cs @@ -0,0 +1,260 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using darq.client; +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; + +namespace FASTER.client +{ + public class DarqMaintenanceBackgroundServiceSettings + { + // Processing chunk size before the task yields + public int morselSize = 512; + + // batch size for background sends + public int batchSize = 16; + + public bool speculative = false; + + public Func producerFactory; + } + + public class DarqBackgroundMaintenanceTask : IDisposable + { + private Darq darq; + private DarqMaintenanceBackgroundServiceSettings settings; + private DprSession session; + + private DarqScanIterator iterator; + private DarqCompletionTracker completionTracker; + private long processedUpTo; + + private IDarqProducer currentProducerClient; + private int numBatched = 0; + + private SimpleObjectPool messagePool; + private ILogger logger; + + /// + /// Constructs a new ColocatedDarqProcessorClient + /// + /// DARQ DprServer that this consumer attaches to + /// information about the DARQ cluster + public DarqBackgroundMaintenanceTask(Darq darq, DarqMaintenanceBackgroundServiceSettings settings, + SimpleObjectPool messagePool, ILogger logger) + { + this.darq = darq; + this.settings = settings; + this.messagePool = 
messagePool; + this.logger = logger; + Reset(); + } + + private void Reset() + { + session = darq.DetachFromWorker(); + currentProducerClient = settings.producerFactory?.Invoke(settings.speculative ? new DprSession() : null); + completionTracker = new DarqCompletionTracker(); + iterator = darq.StartBackgroundScan(settings.speculative); + } + + public long ProcessingLag => darq.log.TailAddress - processedUpTo; + + public void Dispose() + { + iterator?.Dispose(); + currentProducerClient?.Dispose(); + } + + private unsafe bool TryReadEntry(out DarqMessage message) + { + message = null; + long nextAddress = 0; + + if (!iterator.UnsafeGetNext(out var entry, out var entryLength, + out var lsn, out processedUpTo, out var type)) + return false; + + completionTracker.AddEntry(lsn, processedUpTo); + // Short circuit without looking at the entry -- no need to process in background + if (type != DarqMessageType.OUT && type != DarqMessageType.COMPLETION) + { + iterator.UnsafeRelease(); + return true; + } + + // Copy out the entry before dropping protection + message = messagePool.Checkout(); + message.Reset(type, lsn, processedUpTo, + new ReadOnlySpan(entry, entryLength)); + iterator.UnsafeRelease(); + + return true; + } + + // TODO(Tianyu): Create variants that allow DARQ instances to talk with each other through more than just the FASTER wire protocol + private unsafe void SendMessage(DarqMessage m) + { + Debug.Assert(m.GetMessageType() == DarqMessageType.OUT); + var body = m.GetMessageBody(); + fixed (byte* h = body) + { + var dest = *(DarqId*)h; + var toSend = new ReadOnlySpan(h + sizeof(DarqId), + body.Length - sizeof(DarqId)); + var completionTrackerLocal = completionTracker; + var lsn = m.GetLsn(); + // TODO(Tianyu): Make ack more efficient through batching + currentProducerClient.EnqueueMessageWithCallback(dest, toSend, + _ => { completionTrackerLocal.RemoveEntry(lsn); }, darq.Me().guid, lsn); + if (++numBatched == settings.batchSize) + { + numBatched = 0; + 
currentProducerClient.ForceFlush(); + } + } + + m.Dispose(); + } + + private bool TryConsumeNext() + { + var hasNext = TryReadEntry(out var m); + // Don't go through the normal receive code path for performance + if (!darq.IsCompatible(session)) + { + logger.LogWarning("Processor detected rollback, restarting"); + Reset(); + // Reset to next iteration without doing anything + return true; + } + + if (!hasNext) return false; + // Not a message we care about + if (m == null) return true; + + switch (m.GetMessageType()) + { + case DarqMessageType.OUT: + { + SendMessage(m); + break; + } + case DarqMessageType.COMPLETION: + { + var body = m.GetMessageBody(); + unsafe + { + fixed (byte* h = body) + { + for (var completed = (long*)h; completed < h + body.Length; completed++) + completionTracker.RemoveEntry(*completed); + } + } + + completionTracker.RemoveEntry(m.GetLsn()); + m.Dispose(); + break; + } + default: + throw new NotImplementedException(); + } + + if (completionTracker.GetTruncateHead() > darq.log.BeginAddress) + { + // logger.LogInformation($"Truncating log until {completionTracker.GetTruncateHead()}"); + darq.StartLocalAction(); + darq.TruncateUntil(completionTracker.GetTruncateHead()); + darq.EndAction(); + } + + return true; + } + + internal async Task RunAsync(CancellationToken stoppingToken) + { + Console.WriteLine($"Starting background send from address {darq.log.BeginAddress}"); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + for (var i = 0; i < settings.morselSize; i++) + if (!TryConsumeNext()) + break; + + currentProducerClient?.ForceFlush(); + await iterator.WaitAsync(stoppingToken); + } + catch (Exception e) + { + // Just restart the failed background thread + logger.LogWarning($"Exception {e.Message} was thrown, restarting background worker"); + Reset(); + } + } + } + } + + public class DarqMaintenanceBackgroundService : BackgroundService + { + private SimpleObjectPool messagePool; + private ILogger logger; + private 
CancellationToken stoppingToken; + private ConcurrentDictionary dispatchedTasks = new(); + + private Darq defaultDarq; + private DarqMaintenanceBackgroundServiceSettings defaultSettings; + + public DarqMaintenanceBackgroundService(ILogger logger, + Darq defaultDarq = null, DarqMaintenanceBackgroundServiceSettings defaultSettings = null) + { + messagePool = new SimpleObjectPool(() => new DarqMessage(messagePool)); + this.logger = logger; + this.defaultDarq = defaultDarq; + this.defaultSettings = defaultSettings; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + logger.LogInformation("maintenance background service is starting"); + if (defaultDarq != null) + { + Debug.Assert(defaultSettings != null); + RegisterMaintenanceTask(defaultDarq, defaultSettings); + } + await Task.Delay(Timeout.Infinite, stoppingToken); + logger.LogInformation("stop signal received. maintenance background service is cleaning up..."); + + foreach (var task in dispatchedTasks.Keys) + { + if (!dispatchedTasks[task]) + await Task.Yield(); + task.Dispose(); + } + + logger.LogInformation("maintenance background service has finished clean-up, shutting down..."); + } + + public DarqBackgroundMaintenanceTask RegisterMaintenanceTask(Darq darq, DarqMaintenanceBackgroundServiceSettings settings) + { + if (stoppingToken.IsCancellationRequested) throw new TaskCanceledException(); + if ((defaultDarq != null && darq != defaultDarq) || (defaultSettings != null && settings != defaultSettings)) + throw new InvalidOperationException( + "Runtime creation of maintenance task is only allowed if no singleton default DARQ is configured"); + var task = new DarqBackgroundMaintenanceTask(darq, settings, messagePool, logger); + dispatchedTasks[task] = false; + Task.Run(async () => + { + logger.LogInformation($"maintenance background task for DARQ {darq.settings.Me} is starting"); + await task.RunAsync(stoppingToken); + dispatchedTasks[task] = true; + 
logger.LogInformation($"maintenance background task for DARQ {darq.settings.Me} exited"); + }); + return task; + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqCompletionTracker.cs b/cs/research/darq/FASTER.darq/DarqCompletionTracker.cs new file mode 100644 index 000000000..53604f983 --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqCompletionTracker.cs @@ -0,0 +1,98 @@ +using System; +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Linq; +using System.Threading; +using FASTER.common; +using FASTER.core; + +namespace FASTER.libdpr +{ + internal class DarqEntryBucket + { + internal long bucketNum; + internal long rangeEnd = 0; + internal int numIncompleteEntries = 0; + internal bool isSealed = false; + + public void Reset(long bucketNum) + { + this.bucketNum = bucketNum; + rangeEnd = 0; + numIncompleteEntries = 0; + isSealed = false; + } + } + + public class DarqCompletionTracker + { + private SimpleObjectPool bucketPool; + private DarqEntryBucket tailBucket; + private ConcurrentDictionary outstandingBuckets; + private ConcurrentQueue bucketQueue; + + private long truncatedHead = 0; + private int maxSectorRangeBits; + private int truncationInProgress; + + public DarqCompletionTracker(int maxSectorRangeBits = 15) + { + bucketPool = new SimpleObjectPool(() => new DarqEntryBucket()); + outstandingBuckets = new ConcurrentDictionary(); + bucketQueue = new ConcurrentQueue(); + this.maxSectorRangeBits = maxSectorRangeBits; + } + + public long GetTruncateHead() => truncatedHead; + + // Will only be invoked single-threaded + public void AddEntry(long start, long end) + { + while (tailBucket == null || start >> maxSectorRangeBits > tailBucket.bucketNum) + { + if (tailBucket != null) + tailBucket.isSealed = true; + tailBucket = bucketPool.Checkout(); + tailBucket.Reset(start >> maxSectorRangeBits); + outstandingBuckets.TryAdd(tailBucket.bucketNum, tailBucket); + bucketQueue.Enqueue(tailBucket); + } + + 
Debug.Assert(start >> maxSectorRangeBits >= tailBucket.bucketNum); + tailBucket.rangeEnd = Math.Max(tailBucket.rangeEnd, end); + Interlocked.Increment(ref tailBucket.numIncompleteEntries); + } + + public bool RemoveEntry(long start) + { + var ret = outstandingBuckets.TryGetValue(start >> maxSectorRangeBits, out var bucket); + if (!ret) + { + throw new FasterException("removing nonexistent entries from tracking"); + } + + if (Interlocked.Decrement(ref bucket.numIncompleteEntries) == 0 && bucket.isSealed) + return TryUpdateTruncateHead(); + return false; + } + + private bool TryUpdateTruncateHead() + { + // Ensure only one thread is truncating at a time + if (Interlocked.CompareExchange(ref truncationInProgress, 1, 0) != 0) return false; + + var changed = false; + while (bucketQueue.TryPeek(out var bucket)) + { + if (!bucket.isSealed || bucket.numIncompleteEntries != 0) break; + core.Utility.MonotonicUpdate(ref truncatedHead, bucket.rangeEnd, out _); + bucketQueue.TryDequeue(out _); + outstandingBuckets.TryRemove(bucket.bucketNum, out _); + bucketPool.Return(bucket); + changed = true; + } + truncationInProgress = 0; + return changed; + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqMessage.cs b/cs/research/darq/FASTER.darq/DarqMessage.cs new file mode 100644 index 000000000..8e842b033 --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqMessage.cs @@ -0,0 +1,472 @@ +using System.Diagnostics; +using System.Text; +using System.Text.Unicode; +using FASTER.common; +using FASTER.core; +using FASTER.darq; + +namespace FASTER.libdpr +{ + public enum DarqProtocolType : byte + { + DarqSubscribe = 125, + DarqProcessor = 126, + DarqProducer = 127 + } + + /// + /// DARQ Command Type + /// + public enum DarqCommandType : byte + { + INVALID = 0, + + /// + /// DARQ Enqueue + /// + DarqEnqueue, + + /// + /// DARQ Step + /// + DarqStep, + + /// + /// DARQ Register Processor + /// + DarqRegisterProcessor, + + /// + /// DARQ start push + /// + 
DarqStartPush, + } + + /// + /// DARQ Message Type + /// + public enum DarqMessageType : byte + { + /// + IN, + + /// + OUT, + + /// + RECOVERY, + + /// + COMPLETION, + } + + /// + /// DARQ Message + /// + public class DarqMessage : IDisposable + { + private DarqMessageType type; + private long lsn, nextLsn; + private byte[] message; + private int messageSize; + private SimpleObjectPool messagePool; + + /// + /// Create a new DarqMessage object + /// + /// (optional) the object pool this message should be returned to on disposal + public DarqMessage(SimpleObjectPool messagePool = null) + { + message = new byte[1 << 20]; + this.messagePool = messagePool; + } + + /// + public void Dispose() => messagePool?.Return(this); + + /// + /// Type of message + public DarqMessageType GetMessageType() => type; + + /// + /// LSN of the message + public long GetLsn() => lsn; + + /// + /// Lower bound for the LSN of the immediate next message in DARQ + public long GetNextLsn() => nextLsn; + + /// + /// Get the message bytes + public ReadOnlySpan GetMessageBody() => new(message, 0, messageSize); + + /// + /// Reset this message to hold supplied values instead + /// + /// + /// + /// + /// + public void Reset(DarqMessageType type, long lsn, long nextLsn, ReadOnlySpan msg) + { + this.type = type; + this.lsn = lsn; + this.nextLsn = nextLsn; + Debug.Assert(message.Length > msg.Length); + msg.CopyTo(message); + messageSize = msg.Length; + } + } + + /// + /// StepRequests represents a DARQ step + /// + public class StepRequest : IReadOnlySpanBatch + { + internal List consumedMessages; + internal List offsets; + internal int size; + internal byte[] serializationBuffer; + + /// + /// Create a new StepRequest object. 
+ /// + /// (optional) the object pool this request should be returned to on disposal + public StepRequest() + { + serializationBuffer = new byte[1 << 15]; + consumedMessages = new List(); + offsets = new List(); + } + + internal void Reset() + { + consumedMessages.Clear(); + offsets.Clear(); + size = 0; + } + + /// + /// list of messages consumed in this step + public List ConsumedMessages() => consumedMessages; + + /// + public int TotalEntries() + { + return offsets.Count; + } + + /// + /// Grow the underlying serialization buffer to be double of its original size, in case the step no longer fits. + /// + public void Grow() + { + var oldBuffer = serializationBuffer; + serializationBuffer = new byte[2 * oldBuffer.Length]; + Array.Copy(oldBuffer, serializationBuffer, oldBuffer.Length); + } + + /// + public ReadOnlySpan Get(int index) + { + return new Span(serializationBuffer, offsets[index], + (index == (offsets.Count - 1) ? size : offsets[index + 1]) - offsets[index]); + } + } + + /// + /// Builder to populate StepRequest + /// + public struct StepRequestBuilder + { + private StepRequest request; + + /// + /// Constructs a new StepRequestBuilder + /// + /// the StepRequest object to populate + /// ID of the DARQ instance the step is for + public StepRequestBuilder(StepRequest toBuild) + { + request = toBuild; + request.Reset(); + } + + /// + /// Mark a message as consumed by this step + /// + /// LSN of the consumed message + /// self-reference for chaining + public StepRequestBuilder MarkMessageConsumed(long lsn) + { + request.consumedMessages.Add(lsn); + return this; + } + + /// + /// Add an out message to this step. 
+ /// + /// Intended recipient + /// message body, in bytes + /// self-reference for chaining + public unsafe StepRequestBuilder AddOutMessage(DarqId recipient, ReadOnlySpan message) + { + while (request.serializationBuffer.Length - request.size < + message.Length + sizeof(DarqMessageType) + sizeof(DarqId)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + + *(DarqMessageType*)head++ = DarqMessageType.OUT; + *(DarqId*)head = recipient; + head += sizeof(DarqId); + + message.CopyTo(new Span(head, message.Length)); + head += message.Length; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddOutMessage(DarqId recipient, ILogEnqueueEntry message) + { + var messageLength = message.SerializedLength; + while (request.serializationBuffer.Length - request.size < + messageLength + sizeof(DarqMessageType) + sizeof(DarqId)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + + *(DarqMessageType*)head++ = DarqMessageType.OUT; + *(DarqId*)head = recipient; + head += sizeof(DarqId); + + message.SerializeTo(new Span(head, messageLength)); + head += messageLength; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddOutMessage(DarqId recipient, string message) + { + while (request.serializationBuffer.Length - request.size < + message.Length + sizeof(DarqMessageType) + sizeof(DarqId)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + + *(DarqMessageType*) head++ = DarqMessageType.OUT; + *(DarqId*) head = recipient; + head += sizeof(DarqId); + + var ret = Encoding.UTF8.GetBytes(message, new Span(head, message.Length)); + Debug.Assert(ret == message.Length); + head += message.Length; + request.size = (int) (head - b); + } + 
+ return this; + } + + /// + /// Add an out message to this step. + /// + /// Intended recipient + /// message body, in bytes + /// self-reference for chaining + public unsafe StepRequestBuilder AddSelfMessage(ReadOnlySpan message) + { + while (request.serializationBuffer.Length - request.size < + message.Length + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.IN; + message.CopyTo(new Span(head, message.Length)); + head += message.Length; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddSelfMessage(long partitionId, ReadOnlySpan message) + { + while (request.serializationBuffer.Length - request.size < + sizeof(long) + message.Length + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.IN; + *(long*)head = partitionId; + head += sizeof(long); + message.CopyTo(new Span(head, message.Length)); + head += message.Length; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddSelfMessage(ILogEnqueueEntry message) + { + var messageLength = message.SerializedLength; + while (request.serializationBuffer.Length - request.size < + messageLength + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.IN; + message.SerializeTo(new Span(head, messageLength)); + head += messageLength; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddSelfMessage(string message) + { + while (request.serializationBuffer.Length - request.size < + message.Length + sizeof(DarqMessageType)) 
+ request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + + *(DarqMessageType*) head++ = DarqMessageType.IN; + head += sizeof(DarqMessageType); + + Encoding.UTF8.GetBytes(message, new Span(head, message.Length)); + head += message.Length; + request.size = (int) (head - b); + } + + return this; + } + + + /// + /// Add a self message to this step. + /// + /// message body, as bytes + /// self-reference for chaining + public unsafe StepRequestBuilder AddRecoveryMessage(ReadOnlySpan message) + { + while (request.serializationBuffer.Length - request.size < + message.Length + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.RECOVERY; + message.CopyTo(new Span(head, message.Length)); + head += message.Length; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddRecoveryMessage(long partitionId, ReadOnlySpan message) + { + while (request.serializationBuffer.Length - request.size < + sizeof(long) + message.Length + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.RECOVERY; + *(long*)head = partitionId; + head += sizeof(long); + message.CopyTo(new Span(head, message.Length)); + head += message.Length; + request.size = (int)(head - b); + } + + return this; + } + + public unsafe StepRequestBuilder AddRecoveryMessage(ILogEnqueueEntry message) + { + var messageLength = message.SerializedLength; + + while (request.serializationBuffer.Length - request.size < + messageLength + sizeof(DarqMessageType)) + request.Grow(); + + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + 
request.size; + *(DarqMessageType*)head++ = DarqMessageType.RECOVERY; + message.SerializeTo(new Span(head, messageLength)); + head += messageLength; + request.size = (int)(head - b); + } + + return this; + } + + /// + /// Finishes a step for submission + /// + /// composed step object + public unsafe StepRequest FinishStep() + { + // Step needs to do something at least + if (request.consumedMessages.Count < 1 && request.offsets.Count == 0) + throw new FasterException("Empty step detected"); + + while (request.serializationBuffer.Length - request.size < + sizeof(DarqMessageType) + sizeof(long) * request.consumedMessages.Count) + request.Grow(); + + if (request.consumedMessages.Count != 0) + { + request.offsets.Add(request.size); + fixed (byte* b = request.serializationBuffer) + { + var head = b + request.size; + *(DarqMessageType*)head++ = DarqMessageType.COMPLETION; + foreach (var lsn in request.consumedMessages) + { + *(long*)head = lsn; + head += sizeof(long); + } + + request.size = (int)(head - b); + } + } + + return request; + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqProcessor.cs b/cs/research/darq/FASTER.darq/DarqProcessor.cs new file mode 100644 index 000000000..68995541d --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqProcessor.cs @@ -0,0 +1,46 @@ +using FASTER.darq; + +namespace FASTER.libdpr +{ + /// + /// IDarqProcessorClientCapabilities is supplied to DARQ processor implementations to provide access to DARQ features + /// + public interface IDarqProcessorClientCapabilities + { + /// + /// Performs a step as requested + /// + /// step request + /// status of the step + ValueTask Step(StepRequest request); + + DprSession GetDprSession(); + } + + + /// + /// A DARQ Processor is the key abstraction that encapsulates business logic attached to DARQ instances. + /// + public interface IDarqProcessor + { + /// + /// Process a new message intended for this DARQ instance + /// + /// the message. 
Should be explicitly disposed when no longer needed. + /// True if the processor should continue receiving messages. False if the processing loop should exit. + public bool ProcessMessage(DarqMessage m); + + /// + /// Invoked when the DARQ processor (re)starts processing, either because it is attached to a DARQ for the first + /// time or because it has to be restarted due to a failure. In the latter case, the processor should erase or + /// otherwise repair its in-memory local state, which may no longer be consistent. + /// + /// capabilities for use to interact with attached DARQ instance + public void OnRestart(IDarqProcessorClientCapabilities capabilities); + } + + public interface IDarqProcessorClient + { + public Task StartProcessingAsync(T processor, CancellationToken token) where T : IDarqProcessor; + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqScanIterator.cs b/cs/research/darq/FASTER.darq/DarqScanIterator.cs new file mode 100644 index 000000000..44d857f80 --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqScanIterator.cs @@ -0,0 +1,175 @@ +using System.Runtime.InteropServices; +using FASTER.core; + +namespace FASTER.libdpr +{ + /// + /// Iterator for scanning a DARQ + /// + public class DarqScanIterator : IDisposable + { + private FasterLogScanIterator iterator; + private long replayEnd; + private Queue<(long, long, byte[])> recoveryMessages; + private Dictionary replayMessages; + private bool disposed = false; + private byte[] reusedReadBuffer; + private GCHandle? 
handle = null; + private bool hideRecoveryMessages; + + internal DarqScanIterator(FasterLog log, long replayEnd, bool speculative, bool hideRecoveryMessages = true) + { + iterator = log.Scan(0, long.MaxValue, scanUncommitted: speculative); + recoveryMessages = new Queue<(long, long, byte[])>(); + replayMessages = new Dictionary(); + this.replayEnd = replayEnd; + this.hideRecoveryMessages = hideRecoveryMessages; + ScanOnRecovery(); + } + + /// > + public void Dispose() + { + disposed = true; + iterator.Dispose(); + } + + private unsafe void ScanOnRecovery() + { + while (true) + { + while (iterator.UnsafeGetNext(out var entry, out var length, out var currentAddress, + out var nextAddress)) + { + // Should not be inclusive -- replay end is the start address of the last completion record in stepped + if (currentAddress > replayEnd) + { + Console.WriteLine( + $"Current addr {currentAddress} is beyond replay end {replayEnd}, finishing processor recovery..."); + iterator.UnsafeRelease(); + break; + } + + switch (*(DarqMessageType *) entry) + { + case DarqMessageType.OUT: + break; + case DarqMessageType.RECOVERY: + recoveryMessages.Enqueue((currentAddress, nextAddress, + new Span(entry, length).ToArray())); + break; + case DarqMessageType.IN: + replayMessages.Add(currentAddress, length); + break; + case DarqMessageType.COMPLETION: + var completed = (long*)(entry + sizeof(DarqMessageType)); + while (completed < entry + length) + { + var completedLsn = *completed++; + replayMessages.Remove(completedLsn); + } + break; + default: + throw new NotImplementedException(); + } + + iterator.UnsafeRelease(); + } + + if (iterator.NextAddress >= replayEnd) break; + iterator.WaitAsync().AsTask().GetAwaiter().GetResult(); + } + iterator.Reset(); + } + + /// + /// Scan the next entry in DARQ. If successful, must be followed by a UnsafeRelease call to release any + /// resources held in-place for unsafe consumption. 
+ /// + /// pointer to the start of next entry body + /// length of the next entry + /// address of the entry on DARQ (lsn) + /// lower bound of the address of the next entry on DARQ + /// type of entry + /// whether a next entry is available at this moment + public unsafe bool UnsafeGetNext(out byte* entry, out int entryLength, out long currentAddress, + out long nextAddress, out DarqMessageType type) + { + if (handle.HasValue) + throw new FasterException("Trying to get next without release previous"); + type = default; + + // Try to replay state messages first + if (recoveryMessages.Count != 0) + { + while (recoveryMessages.TryDequeue(out var m)) + { + currentAddress = m.Item1; + nextAddress = m.Item2; + handle = GCHandle.Alloc(m.Item3, GCHandleType.Pinned); + type = DarqMessageType.RECOVERY; + entry = (byte*)handle.Value.AddrOfPinnedObject(); + entryLength = m.Item3.Length; + return true; + } + } + + while (true) + { + if (!iterator.UnsafeGetNext(out entry, out entryLength, out currentAddress, out nextAddress)) + return false; + + type = (DarqMessageType) (*entry); + switch (type) + { + case DarqMessageType.IN: + if (currentAddress > replayEnd) break; + // If still replaying messages, only allow messages that should be replayed to go through + if (replayMessages.Remove(currentAddress)) break; + // Otherwise, skip this message because there is a later completion message + iterator.UnsafeRelease(); + continue; + case DarqMessageType.OUT: + case DarqMessageType.COMPLETION: + break; + // Should be seen by DARQ consumer only if requested and not replayed + case DarqMessageType.RECOVERY: + if (!hideRecoveryMessages && currentAddress > replayEnd) break; + iterator.UnsafeRelease(); + continue; + default: + throw new FasterException("Unexpected entry type"); + } + + // Skip header byte + entry += sizeof(byte); + entryLength -= 1; + return true; + } + } + + /// + /// Releases resources held from a previous successful UnsafeGetNext call + /// + public void UnsafeRelease() 
+ { + if (handle.HasValue) + { + handle.Value.Free(); + handle = null; + } + else + iterator.UnsafeRelease(); + } + + /// + /// Wait until the next entry is available or when no more entries will be available + /// + /// cancellation token + /// task for the availability of next entry + public ValueTask WaitAsync(CancellationToken token = default) + { + return iterator.WaitAsync(token); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DarqSettings.cs b/cs/research/darq/FASTER.darq/DarqSettings.cs new file mode 100644 index 000000000..41cd37d21 --- /dev/null +++ b/cs/research/darq/FASTER.darq/DarqSettings.cs @@ -0,0 +1,159 @@ +using darq; +using FASTER.core; +using FASTER.libdpr; + +namespace FASTER.darq +{ + + /// + /// Each DARQ instance is uniquely numbered within a cluster for identification and routing + /// + public struct DarqId : IEquatable + { + + /// + /// globally-unique worker ID within a DPR cluster + /// + public readonly long guid; + + /// + /// Constructs a worker with the given guid + /// + /// worker guid + public DarqId(long guid) + { + this.guid = guid; + } + + public readonly bool Equals(DarqId other) + { + return guid == other.guid; + } + + public static bool operator ==(DarqId left, DarqId right) + { + return left.Equals(right); + } + + public static bool operator !=(DarqId left, DarqId right) + { + return !left.Equals(right); + } + + /// + public override bool Equals(object obj) + { + return obj is DarqId other && Equals(other); + } + + /// + public override int GetHashCode() + { + return guid.GetHashCode(); + } + } + + /// + /// DARQ Settings + /// + public class DarqSettings + { + /// + /// The DPRFinder for the cluster this DARQ should connect to, or null if you will only use DARQ + /// non-speculatively and do not wish to connect to a DPR cluster. If a non-null DPRFinder is supplied, + /// DARQ will operate in speculative mode. 
+ /// + public IDprFinder DprFinder = null; + + public DarqId Me = new(0); + + public DprWorkerId MyDpr = DprWorkerId.INVALID; + + public long CheckpointPeriodMilli = 5; + + public long RefreshPeriodMilli = 5; + + /// + /// Device used for underlying log + /// + public IDevice LogDevice; + + + /// + /// Size of a page in the underlying log, in bytes. Must be a power of 2. + /// + public long PageSize = 1L << 22; + + /// + /// Total size of in-memory part of log, in bytes. Must be a power of 2. + /// Should be at least one page long + /// Num pages = 2^(MemorySizeBits-PageSizeBits) + /// + public long MemorySize = 1L << 23; + + + /// + /// Size of a segment (group of pages), in bytes. Must be a power of 2. + /// This is the granularity of files on disk + /// + public long SegmentSize = 1L << 30; + + /// + /// Log commit manager - if you want to override the default implementation of commit. + /// + public ILogCommitManager LogCommitManager = null; + + /// + /// Use specified directory (path) as base for storing and retrieving underlying log commits. By default, + /// commits will be stored in a folder named log-commits under this directory. If not provided, + /// we use the base path of the log device by default. + /// + public string LogCommitDir = null; + + /// + /// Type of checksum to add to log + /// + public LogChecksumType LogChecksum = LogChecksumType.None; + + /// + /// Fraction of underlying log marked as mutable (uncommitted) + /// + public double MutableFraction = 0; + + /// + /// When FastCommitMode is enabled, FasterLog will reduce commit critical path latency, but may result in slower + /// recovery to a commit on restart. Additionally, FastCommitMode is only possible when log checksum is turned + /// on. 
+ /// + public bool FastCommitMode = false; + + /// + /// When DeleteOnClose is true, DARQ will remove all persistent state on shutdown -- useful for testing but + /// will result in data loss otherwise, + /// + public bool DeleteOnClose = false; + + /// + /// When CleanStart is true, DARQ will remove all persistent previous state on startup -- useful for testing but + /// will result in data loss otherwise + /// + public bool CleanStart = false; + + /// + /// Create default configuration settings for DARQ. You need to create and specify LogDevice + /// explicitly with this API. + /// Use Utility.ParseSize to specify sizes in familiar string notation (e.g., "4k" and "4 MB"). + /// + public DarqSettings() { } + + /// + public override string ToString() + { + var retStr = $"log memory: {Utility.PrettySize(MemorySize)}; log page: {Utility.PrettySize(PageSize)}; log segment: {Utility.PrettySize(SegmentSize)}"; + retStr += $"; log device: {(LogDevice == null ? "null" : LogDevice.GetType().Name)}"; + retStr += $"; mutable fraction: {MutableFraction}; fast commit mode: {(FastCommitMode ? "yes" : "no")}"; + retStr += $"; delete on close: {(DeleteOnClose ? 
"yes" : "no")}"; + return retStr; + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/DeduplicationVector.cs b/cs/research/darq/FASTER.darq/DeduplicationVector.cs new file mode 100644 index 000000000..5e2c7cc9f --- /dev/null +++ b/cs/research/darq/FASTER.darq/DeduplicationVector.cs @@ -0,0 +1,68 @@ +using System; +using System.Threading; + +namespace FASTER.libdpr +{ + internal class DeduplicationVector : IStateObjectAttachment + { + private const int MAX_SIZE = 32; + private long[] dvc = new long[MAX_SIZE]; + private int used = 0; + + public DeduplicationVector() + { + for (var i = 0; i < MAX_SIZE; i++) + dvc[i] = -1; + } + + public bool Process(long id, long lsn) + { + var result = core.Utility.MonotonicUpdate(ref dvc[id], lsn, out var old); + if (old == -1) + Interlocked.Increment(ref used); + return result; + } + + public int SerializedSize() + { + return used * 2 * sizeof(long) + sizeof(int); + } + + public void SerializeTo(Span buffer) + { + var head = 0; + BitConverter.TryWriteBytes(buffer.Slice(head), used); + head += sizeof(int); + for (var i = 0; i < MAX_SIZE; i++) + { + if (dvc[i] == -1) continue; + BitConverter.TryWriteBytes(buffer.Slice(head), (long) i); + head += sizeof(long); + BitConverter.TryWriteBytes(buffer.Slice(head), dvc[i]); + head += sizeof(long); + } + } + + public void RecoverFrom(ReadOnlySpan serialized) + { + unsafe + { + fixed (byte* b = serialized) + { + var head = b; + var count = *(int*) head; + head += sizeof(int); + for (var i = 0; i < count; i++) + { + var worker = *(long*) head; + head += sizeof(long); + var val = *(long*) head; + head += sizeof(long); + dvc[worker] = val; + } + + } + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/FASTER.darq.csproj b/cs/research/darq/FASTER.darq/FASTER.darq.csproj new file mode 100644 index 000000000..38b801c81 --- /dev/null +++ b/cs/research/darq/FASTER.darq/FASTER.darq.csproj @@ -0,0 +1,29 @@ + + + + net7.0 + enable + true + darq 
+ + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + diff --git a/cs/research/darq/FASTER.darq/IDarqProducer.cs b/cs/research/darq/FASTER.darq/IDarqProducer.cs new file mode 100644 index 000000000..cee0b52a8 --- /dev/null +++ b/cs/research/darq/FASTER.darq/IDarqProducer.cs @@ -0,0 +1,11 @@ +using FASTER.darq; + +namespace darq.client; + +public interface IDarqProducer : IDisposable +{ + public void EnqueueMessageWithCallback(DarqId darqId, ReadOnlySpan message, Action callback, + long producerId, long lsn); + + public void ForceFlush(); +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/SerializedDarqEntryBatch.cs b/cs/research/darq/FASTER.darq/SerializedDarqEntryBatch.cs new file mode 100644 index 000000000..99c08c1e7 --- /dev/null +++ b/cs/research/darq/FASTER.darq/SerializedDarqEntryBatch.cs @@ -0,0 +1,79 @@ +using System; +using FASTER.core; + +namespace FASTER.libdpr +{ + public unsafe struct SerializedDarqEntryBatch : IReadOnlySpanBatch + { + private byte* head; + + public static int ComputeSerializedSize(ReadOnlySpan message) + { + // For single messages -- insert in message type + return 2 * sizeof(int) + sizeof(byte) + message.Length; + } + + public static int ComputeSerializedSize(IReadOnlySpanBatch original) + { + // No need to add additional message type for batched interface + var size = (original.TotalEntries() + 1) * sizeof(int); + for (var i = 0; i < original.TotalEntries(); i++) + { + size += original.Get(i).Length; + } + return size; + } + + public SerializedDarqEntryBatch(byte* head) + { + this.head = head; + } + + public int TotalSize() + { + return ((int*) head)[TotalEntries()]; + } + + public int TotalEntries() + { + return *(int*) head; + } + + public ReadOnlySpan Get(int index) + { + var offsetStart = index == 0 ? 
sizeof(int) * (TotalEntries() + 1) : ((int*) head)[index]; + var offsetEnd = ((int*) head)[index + 1]; + + return new ReadOnlySpan(head + offsetStart, offsetEnd - offsetStart); + } + + public void SetContent(ReadOnlySpan message) + { + var writeHead = head; + *(int*) writeHead = 1; + writeHead += sizeof(int); + + *(int*) writeHead = sizeof(int) * 2 + sizeof(byte) + message.Length; + writeHead += sizeof(int); + + *(DarqMessageType*) writeHead = DarqMessageType.IN; + writeHead += sizeof(DarqMessageType); + + message.CopyTo(new Span(writeHead, message.Length)); + } + + public void SetContent(IReadOnlySpanBatch batch) + { + *(int*) head = batch.TotalEntries(); + + for (var i = 0; i < batch.TotalEntries(); i++) + { + var entry = batch.Get(i); + var offsetStart = i == 0 ? sizeof(int) * (TotalEntries() + 1) : ((int*) head)[i]; + ((int*) head)[i + 1] = offsetStart + entry.Length; + + entry.CopyTo(new Span(head + offsetStart, entry.Length)); + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/client/ColocatedDarqProcessorClient.cs b/cs/research/darq/FASTER.darq/client/ColocatedDarqProcessorClient.cs new file mode 100644 index 000000000..5bfc258d3 --- /dev/null +++ b/cs/research/darq/FASTER.darq/client/ColocatedDarqProcessorClient.cs @@ -0,0 +1,177 @@ +using System.Diagnostics; +using System.Runtime.CompilerServices; +using FASTER.common; +using FASTER.libdpr; + +namespace FASTER.darq +{ + /// + /// A DarqConsumer that runs in the same process as a DARQ instance + /// + public class ColocatedDarqProcessorClient : IDarqProcessorClient + { + private Darq darq; + private SimpleObjectPool messagePool; + private ManualResetEventSlim terminationComplete; + + // TODO(Tianyu): Reason about behavior in the case of rollback + public long incarnation; + private DarqScanIterator iterator; + private DprSession session; + private Capabilities capabilities; + + private bool speculative; + + private enum ProcessResult + { + CONTINUE, + NO_ENTRY, + TERMINATED 
+ } + + private class Capabilities : IDarqProcessorClientCapabilities + { + private readonly ColocatedDarqProcessorClient parent; + private DprSession session; + + public Capabilities(ColocatedDarqProcessorClient parent) + { + this.parent = parent; + session = parent.session; + } + + public async ValueTask Step(StepRequest request) + { + // If step results in a version mismatch, rely on the scan to trigger a rollback for simplicity + if (!await parent.darq.TakeOnDependencyAndStartActionAsync(session)) + return StepStatus.REINCARNATED; + var status = parent.darq.Step(parent.incarnation, request); + parent.darq.EndAction(); + return status; + } + + public DprSession GetDprSession() => session; + } + + /// + /// Constructs a new ColocatedDarqProcessorClient + /// + /// DARQ DprServer that this consumer attaches to + /// information about the DARQ cluster + public ColocatedDarqProcessorClient(Darq darq, bool speculative) + { + this.darq = darq; + messagePool = new SimpleObjectPool(() => new DarqMessage(messagePool)); + this.speculative = speculative; + } + + public void Dispose() + { + messagePool.Dispose(); + iterator?.Dispose(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe bool TryReadEntry(out DarqMessage message) + { + message = null; + if (!iterator.UnsafeGetNext(out var entry, out var entryLength, + out var lsn, out var nextLsn, out var type)) + return false; + + // Short circuit without looking at the entry -- no need to process in background + if (type != DarqMessageType.IN && type != DarqMessageType.RECOVERY) + { + iterator.UnsafeRelease(); + return true; + } + + // Copy out the entry before dropping protection + message = messagePool.Checkout(); + message.Reset(type, lsn, nextLsn, new ReadOnlySpan(entry, entryLength)); + iterator.UnsafeRelease(); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private ProcessResult TryConsumeNext(T processor) where T : IDarqProcessor + { + try + { + var hasNext = 
TryReadEntry(out var m); + if (!hasNext) + return ProcessResult.NO_ENTRY; + // Not a message we need to worry about + if (m == null) return ProcessResult.CONTINUE; + + session.DependOn(darq); + switch (m.GetMessageType()) + { + case DarqMessageType.IN: + case DarqMessageType.RECOVERY: + if (processor.ProcessMessage(m)) + return ProcessResult.CONTINUE; + return ProcessResult.TERMINATED; + default: + throw new NotImplementedException(); + } + } + catch (DprSessionRolledBackException) + { + Console.WriteLine("Processor detected rollback, restarting"); + OnProcessorClientRestart(processor); + // Reset to next iteration without doing anything + return ProcessResult.CONTINUE; + } + } + + private void OnProcessorClientRestart(T processor) where T : IDarqProcessor + { + session = new DprSession(); + capabilities = new Capabilities(this); + processor.OnRestart(capabilities); + iterator = darq.StartScan(speculative); + } + + /// + public async Task StartProcessingAsync(T processor, CancellationToken token) + where T : IDarqProcessor + { + try + { + terminationComplete = new ManualResetEventSlim(); + incarnation = darq.RegisterNewProcessor(); + OnProcessorClientRestart(processor); + Console.WriteLine("Starting Processor..."); + while (!token.IsCancellationRequested) + { + ProcessResult result; + do + { + result = TryConsumeNext(processor); + } while (result == ProcessResult.CONTINUE); + + if (result == ProcessResult.TERMINATED) + break; + + // FASTER.darq.StateObject().RefreshSafeReadTail(); + try + { + await iterator.WaitAsync(token); + } + catch (OperationCanceledException) {} + } + + Console.WriteLine($"Colocated processor has exited on worker {darq.Me().guid}"); + terminationComplete.Set(); + } + catch (Exception e) + { + Console.WriteLine("C# why you eat exceptions"); + Console.WriteLine(e.Message); + Console.WriteLine(e.StackTrace); + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/client/DarqClientNetworkSession.cs 
b/cs/research/darq/FASTER.darq/client/DarqClientNetworkSession.cs new file mode 100644 index 000000000..53de12d95 --- /dev/null +++ b/cs/research/darq/FASTER.darq/client/DarqClientNetworkSession.cs @@ -0,0 +1,65 @@ +using System; +using System.Net.Sockets; + +namespace FASTER.client +{ + internal interface INetworkMessageConsumer + { + void ProcessReplies(byte[] buf, int offset, int size); + } + + internal class DarqClientNetworkSession where T : INetworkMessageConsumer + { + internal readonly Socket socket; + internal readonly T client; + + private int bytesRead; + private int readHead; + + public DarqClientNetworkSession(Socket socket, T client) + { + this.socket = socket; + this.client = client; + bytesRead = 0; + readHead = 0; + } + + internal void AddBytesRead(int bytesRead) => this.bytesRead += bytesRead; + + internal int TryConsumeMessages(byte[] buf) + { + while (TryReadMessages(buf, out var offset, out var size)) + client.ProcessReplies(buf, offset, size); + + // The bytes left in the current buffer not consumed by previous operations + var bytesLeft = bytesRead - readHead; + if (bytesLeft != bytesRead) + { + // Shift them to the head of the array so we can reset the buffer to a consistent state + Array.Copy(buf, readHead, buf, 0, bytesLeft); + bytesRead = bytesLeft; + readHead = 0; + } + + return bytesRead; + } + + private bool TryReadMessages(byte[] buf, out int offset, out int size) + { + offset = default; + size = default; + var bytesAvailable = bytesRead - readHead; + // Need to at least have read off of size field on the message + if (bytesAvailable < sizeof(int)) return false; + + size = -BitConverter.ToInt32(buf, readHead); + // Not all of the message has arrived + if (bytesAvailable < size + sizeof(int)) return false; + offset = readHead + sizeof(int); + + // Consume this message and the header + readHead += size + sizeof(int); + return true; + } + } +} \ No newline at end of file diff --git 
a/cs/research/darq/FASTER.darq/client/DarqProcessorClient.cs b/cs/research/darq/FASTER.darq/client/DarqProcessorClient.cs new file mode 100644 index 000000000..ce0e15f9d --- /dev/null +++ b/cs/research/darq/FASTER.darq/client/DarqProcessorClient.cs @@ -0,0 +1,561 @@ +using System.Diagnostics; +using System.Net; +using System.Net.Sockets; +using System.Runtime.CompilerServices; +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; + +namespace FASTER.client +{ + internal class DarqProcessorWriteClient : IDisposable, INetworkMessageConsumer + { + private DprSession dprSession; + private readonly INetworkSender networkSender; + + // TODO(Tianyu): Change to something else for DARQ + private readonly MaxSizeSettings maxSizeSettings; + readonly int bufferSize; + + private bool disposed; + private int offset; + private int numMessages; + private readonly int maxOutstanding; + private volatile int numOutstanding; + private const int reservedDprHeaderSpace = 80; + + private TaskCompletionSource outstandingRegistrationRequest; + private ElasticCircularBuffer> outstandingStepQueue = new(); + + public DarqProcessorWriteClient(DprSession dprSession, string address, int port, int maxOutstanding) + { + this.dprSession = dprSession; + maxSizeSettings = new MaxSizeSettings(); + bufferSize = BufferSizeUtils.ClientBufferSize(maxSizeSettings); + + networkSender = new TcpNetworkSender(GetSendSocket(address, port), maxSizeSettings); + networkSender.GetResponseObject(); + offset = 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size; + numMessages = 0; + this.maxOutstanding = maxOutstanding; + } + + public void Dispose() + { + disposed = true; + networkSender.Dispose(); + } + + public unsafe void Flush() + { + try + { + if (offset > 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size) + { + var head = networkSender.GetResponseObjectHead(); + // Set packet size in header + *(int*)head = -(offset - sizeof(int)); + head += sizeof(int); + + 
((BatchHeader*)head)->SetNumMessagesProtocol(numMessages, + (WireFormat)DarqProtocolType.DarqProcessor); + head += sizeof(BatchHeader); + + // Set DprHeader size + *(int*)head = reservedDprHeaderSpace; + head += sizeof(int); + + // populate DPR header + var headerBytes = new Span(head, reservedDprHeaderSpace); + if (dprSession.TagMessage(headerBytes) < 0) + // TODO(Tianyu): Handle size mismatch by probably copying into a new array and up-ing reserved space in the future + throw new NotImplementedException(); + + Interlocked.Add(ref numOutstanding, numMessages); + while (numOutstanding >= maxOutstanding) + { + // Expecting a fairly quick turn around, so just spin + } + + networkSender.SendResponse(0, offset); + networkSender.GetResponseObject(); + offset = 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size; + numMessages = 0; + } + } + catch (DprSessionRolledBackException) + { + // Ensure that callback queue is drained only on a single-thread. This is not a scalability issue + // because except in the event of a rollback, callback queue is not concurrently accessed + lock (outstandingStepQueue) + { + outstandingRegistrationRequest?.SetCanceled(); + while (!outstandingStepQueue.IsEmpty()) + outstandingStepQueue.Dequeue().SetResult(StepStatus.REINCARNATED); + } + throw; + } + } + + public unsafe Task Step(StepRequest stepRequest, long incarnation, bool forceFlush = true) + { + byte* curr, end; + var entryBatchSize = SerializedDarqEntryBatch.ComputeSerializedSize(stepRequest); + while (true) + { + end = networkSender.GetResponseObjectHead() + bufferSize; + curr = networkSender.GetResponseObjectHead() + offset; + var serializedSize = sizeof(byte) + sizeof(long) * 2 + entryBatchSize; + if (end - curr >= serializedSize && numMessages < maxOutstanding) break; + Flush(); + } + + *curr = (byte) DarqCommandType.DarqStep; + curr += sizeof(byte); + + *(long*) curr = incarnation; + curr += sizeof(long); + + var batch = new SerializedDarqEntryBatch(curr); + 
batch.SetContent(stepRequest); + curr += entryBatchSize; + offset = (int) (curr - networkSender.GetResponseObjectHead()); + numMessages++; + var result = new TaskCompletionSource(); + outstandingStepQueue.Enqueue(result); + if (forceFlush) Flush(); + return result.Task; + } + + public unsafe long RegisterProcessor() + { + Debug.Assert(outstandingRegistrationRequest == null); + byte* curr, end; + while (true) + { + end = networkSender.GetResponseObjectHead() + bufferSize; + curr = networkSender.GetResponseObjectHead() + offset; + var serializedSize = sizeof(byte); + if (end - curr >= serializedSize) break; + Flush(); + } + + *curr = (byte) DarqCommandType.DarqRegisterProcessor; + curr += sizeof(byte); + + offset = (int) (curr - networkSender.GetResponseObjectHead()); + numMessages++; + outstandingRegistrationRequest = new TaskCompletionSource(); + Flush(); + var incarnation = outstandingRegistrationRequest.Task.GetAwaiter().GetResult(); + outstandingRegistrationRequest = null; + return incarnation; + } + + unsafe void INetworkMessageConsumer.ProcessReplies(byte[] buf, int offset, int size) + { + fixed (byte* b = buf) + { + var src = b + offset; + var batchHeader = *(BatchHeader*) src; + src += sizeof(BatchHeader); + + var dprHeader = new ReadOnlySpan(src, DprMessageHeader.FixedLenSize); + src += DprMessageHeader.FixedLenSize; + + // Ensure that callback queue is drained only on a single-thread. 
This is not a scalability issue + // because except in the event of a rollback, callback queue is not concurrently accessed + lock (outstandingStepQueue) + { + try + { + if (!dprSession.Receive(dprHeader)) return; + + // TODO(Tianyu): Handle consumer id mismatch cases + for (var i = 0; i < batchHeader.NumMessages; i++) + { + var type = *(DarqCommandType*)src; + src += sizeof(DarqCommandType); + switch (type) + { + case DarqCommandType.DarqStep: + var stepStatus = *(StepStatus*)src; + src += sizeof(StepStatus); + if (stepStatus == StepStatus.REINCARNATED) + // TODO: Terminate execution gracefully here + throw new NotImplementedException(); + var request = outstandingStepQueue.Dequeue(); + Interlocked.Decrement(ref numOutstanding); + request.SetResult(stepStatus); + break; + case DarqCommandType.DarqRegisterProcessor: + Debug.Assert(outstandingRegistrationRequest != null); + outstandingRegistrationRequest.SetResult(*(long*)src); + outstandingRegistrationRequest = null; + break; + default: + throw new NotImplementedException(); + } + } + } + catch (DprSessionRolledBackException) + { + outstandingRegistrationRequest?.SetCanceled(); + while (!outstandingStepQueue.IsEmpty()) + outstandingStepQueue.Dequeue().SetResult(StepStatus.REINCARNATED); + } + } + } + } + + private Socket GetSendSocket(string address, int port, int millisecondsTimeout = -2) + { + var ip = IPAddress.Parse(address); + var endPoint = new IPEndPoint(ip, port); + var socket = new Socket(ip.AddressFamily, SocketType.Stream, ProtocolType.Tcp) + { + NoDelay = true + }; + + if (millisecondsTimeout != -2) + { + IAsyncResult result = socket.BeginConnect(endPoint, null, null); + result.AsyncWaitHandle.WaitOne(millisecondsTimeout, true); + if (socket.Connected) + socket.EndConnect(result); + else + { + socket.Close(); + throw new Exception("Failed to connect server."); + } + } + else + { + socket.Connect(endPoint); + } + + // Ok to create new event args on accept because we assume a connection to be long-running 
+ var receiveEventArgs = new SocketAsyncEventArgs(); + var bufferSize = BufferSizeUtils.ServerBufferSize(maxSizeSettings); + receiveEventArgs.SetBuffer(new byte[bufferSize], 0, bufferSize); + receiveEventArgs.UserToken = new DarqClientNetworkSession(socket, this); + receiveEventArgs.Completed += RecvEventArg_Completed; + var response = socket.ReceiveAsync(receiveEventArgs); + Debug.Assert(response); + return socket; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool HandleReceiveCompletion(SocketAsyncEventArgs e) + { + var connState = (DarqClientNetworkSession) e.UserToken; + if (e.BytesTransferred == 0 || e.SocketError != SocketError.Success || disposed) + { + connState.socket.Dispose(); + e.Dispose(); + return false; + } + + connState.AddBytesRead(e.BytesTransferred); + var newHead = connState.TryConsumeMessages(e.Buffer); + if (newHead == e.Buffer.Length) + { + // Need to grow input buffer + var newBuffer = new byte[e.Buffer.Length * 2]; + Array.Copy(e.Buffer, newBuffer, e.Buffer.Length); + e.SetBuffer(newBuffer, newHead, newBuffer.Length - newHead); + } + else + e.SetBuffer(newHead, e.Buffer.Length - newHead); + + return true; + } + + private void RecvEventArg_Completed(object sender, SocketAsyncEventArgs e) + { + try + { + var connState = (DarqClientNetworkSession) e.UserToken; + do + { + // No more things to receive + if (!HandleReceiveCompletion(e)) break; + } while (!connState.socket.ReceiveAsync(e)); + } + // ignore session socket disposed due to client session dispose + catch (ObjectDisposedException) + { + } + } + } + + internal class DarqProcessorReadClient : IDisposable, INetworkMessageConsumer + { + internal ElasticCircularBuffer pendingMessages; + internal SimpleObjectPool messagePool; + private MaxSizeSettings maxSizeSettings; + private readonly INetworkSender networkSender; + private bool disposed; + private int maxBuffered; + private DprSession session; + private bool rolledBack = false; + + public 
DarqProcessorReadClient(DprSession session, string address, int port, int maxBuffered) + { + maxSizeSettings = new MaxSizeSettings(); + networkSender = new TcpNetworkSender(GetSendSocket(address, port), maxSizeSettings); + this.maxBuffered = maxBuffered; + messagePool = new SimpleObjectPool(() => new DarqMessage(messagePool), 2 * maxBuffered); + pendingMessages = new ElasticCircularBuffer(); + this.session = session; + } + + public void Dispose() + { + disposed = true; + networkSender.Dispose(); + } + + public unsafe void StartReceivePush() + { + var offset = sizeof(int) + BatchHeader.Size; + var numMessages = 0; + networkSender.GetResponseObject(); + var curr = networkSender.GetResponseObjectHead() + offset; + *curr = (byte) DarqCommandType.DarqStartPush; + curr += sizeof(byte); + *curr = 1; + curr += sizeof(byte); + + offset = (int) (curr - networkSender.GetResponseObjectHead()); + numMessages++; + var head = networkSender.GetResponseObjectHead(); + // Set packet size in header + *(int*) head = -(offset - sizeof(int)); + head += sizeof(int); + + ((BatchHeader*) head)->SetNumMessagesProtocol(numMessages, (WireFormat) DarqProtocolType.DarqSubscribe); + + networkSender.SendResponse(0, offset); + } + + unsafe void INetworkMessageConsumer.ProcessReplies(byte[] buf, int offset, int size) + { + if (rolledBack) return; + + fixed (byte* b = buf) + { + var src = b + offset; + + var count = ((BatchHeader*) src)->NumMessages; + src += BatchHeader.Size; + + var dprOffset = *(int*) src; + src += sizeof(int); + + var dprHeaderSize = *(int*) (src + dprOffset); + var dprHeader = new ReadOnlySpan(src + dprOffset + sizeof(int), dprHeaderSize); + try + { + if (!session.Receive(dprHeader)) return; + + for (int i = 0; i < count; i++) + { + var lsn = *(long*)src; + src += sizeof(long); + var nextLsn = *(long*)src; + src += sizeof(long); + var type = *(DarqMessageType*)src; + src += sizeof(DarqMessageType); + var len = *(int*)src; + src += sizeof(int); + Debug.Assert(type is 
DarqMessageType.IN or DarqMessageType.RECOVERY); + var m = messagePool.Checkout(); + m.Reset(type, lsn, nextLsn, new ReadOnlySpan(src, len)); + pendingMessages.Enqueue(m); + src += len; + } + } + catch (DprSessionRolledBackException) + { + var m = messagePool.Checkout(); + // Use a special message to notify of rollback and then go to a sink state + m.Reset(DarqMessageType.IN, -1, -1, ReadOnlySpan.Empty); + pendingMessages.Enqueue(m); + rolledBack = true; + } + } + } + + private Socket GetSendSocket(string address, int port, int millisecondsTimeout = -2) + { + var ip = IPAddress.Parse(address); + var endPoint = new IPEndPoint(ip, port); + var socket = new Socket(ip.AddressFamily, SocketType.Stream, ProtocolType.Tcp) + { + NoDelay = true + }; + + if (millisecondsTimeout != -2) + { + IAsyncResult result = socket.BeginConnect(endPoint, null, null); + result.AsyncWaitHandle.WaitOne(millisecondsTimeout, true); + if (socket.Connected) + socket.EndConnect(result); + else + { + socket.Close(); + throw new Exception("Failed to connect server."); + } + } + else + { + socket.Connect(endPoint); + } + + // Ok to create new event args on accept because we assume a connection to be long-running + var receiveEventArgs = new SocketAsyncEventArgs(); + var bufferSize = BufferSizeUtils.ServerBufferSize(maxSizeSettings); + receiveEventArgs.SetBuffer(new byte[bufferSize], 0, bufferSize); + receiveEventArgs.UserToken = new DarqClientNetworkSession(socket, this); + receiveEventArgs.Completed += RecvEventArg_Completed; + var response = socket.ReceiveAsync(receiveEventArgs); + Debug.Assert(response); + return socket; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool HandleReceiveCompletion(SocketAsyncEventArgs e) + { + var connState = (DarqClientNetworkSession) e.UserToken; + if (e.BytesTransferred == 0 || e.SocketError != SocketError.Success || disposed) + { + connState.socket.Dispose(); + e.Dispose(); + return false; + } + + 
connState.AddBytesRead(e.BytesTransferred); + var newHead = connState.TryConsumeMessages(e.Buffer); + if (newHead == e.Buffer.Length) + { + // Need to grow input buffer + var newBuffer = new byte[e.Buffer.Length * 2]; + Array.Copy(e.Buffer, newBuffer, e.Buffer.Length); + e.SetBuffer(newBuffer, newHead, newBuffer.Length - newHead); + } + else + e.SetBuffer(newHead, e.Buffer.Length - newHead); + + return true; + } + + private void RecvEventArg_Completed(object sender, SocketAsyncEventArgs e) + { + try + { + var connState = (DarqClientNetworkSession) e.UserToken; + do + { + // No more things to receive + if (!HandleReceiveCompletion(e)) break; + while (pendingMessages.ApproxCount >= maxBuffered) + { + // Wait for processor to keep up + } + } while (!connState.socket.ReceiveAsync(e)); + } + // ignore session socket disposed due to client session dispose + catch (ObjectDisposedException) + { + } + } + } + + public class DarqProcessorClient : IDarqProcessorClient, IDarqProcessorClientCapabilities, IDisposable + { + private string address; + private int port; + + private long incarnation; + private DprSession session; + private DarqProcessorReadClient readClient; + // TODO(Tianyu): May need to make this thread-safe + private DarqProcessorWriteClient writeClient; + + private int maxOutstandingSteps, maxReadBuffer; + + public DarqProcessorClient(string address, int port, int maxOutstandingSteps = 1 << 10, int maxReadBuffer = 1 << 10) + { + this.address = address; + this.port = port; + session = new DprSession(); + this.maxOutstandingSteps = maxOutstandingSteps; + this.maxReadBuffer = maxReadBuffer; + } + + + public ValueTask Step(StepRequest request) + { + throw new NotImplementedException(); + // return new ValueTask(writeClient.Step(request, incarnation, false)); + } + + public DprSession GetDprSession() + { + return session; + } + + /// + public async Task StartProcessingAsync(T processor, CancellationToken token) where T : IDarqProcessor + { + readClient = new 
DarqProcessorReadClient(session, address, port, maxReadBuffer); + writeClient = new DarqProcessorWriteClient(session, address, port, maxOutstandingSteps); + incarnation = writeClient.RegisterProcessor(); + readClient.StartReceivePush(); + processor.OnRestart(this); + + while (!token.IsCancellationRequested) + { + if (!readClient.pendingMessages.IsEmpty()) + { + var m = readClient.pendingMessages.Dequeue(); + // This is a special rollback signal + if (m.GetLsn() == -1 && m.GetNextLsn() == -1) + { + session = new DprSession(); + readClient = new DarqProcessorReadClient(session, address, port, maxReadBuffer); + writeClient = new DarqProcessorWriteClient(session, address, port, maxOutstandingSteps); + readClient.StartReceivePush(); + processor.OnRestart(this); + continue; + } + + switch (m.GetMessageType()) + { + case DarqMessageType.IN: + case DarqMessageType.RECOVERY: + // TODO(Tianyu): Hacky + if (!processor.ProcessMessage(m)) + { + // TODO(Tianyu): Need to worry about clean shutdown? + writeClient.Flush(); + return; + } + break; + default: + throw new NotImplementedException(); + } + } + writeClient.Flush(); + // Otherwise, just continue looping + } + } + + public void Dispose() + { + readClient.Dispose(); + writeClient.Dispose(); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/client/DarqProducerClient.cs b/cs/research/darq/FASTER.darq/client/DarqProducerClient.cs new file mode 100644 index 000000000..afda6ba9c --- /dev/null +++ b/cs/research/darq/FASTER.darq/client/DarqProducerClient.cs @@ -0,0 +1,424 @@ +using System.Diagnostics; +using System.Net; +using System.Net.Sockets; +using System.Runtime.CompilerServices; +using darq.client; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; +using FASTER.libdpr.proto; + +namespace FASTER.client +{ + /// + /// Encodes information about a DARQ Cluster + /// + public interface IDarqClusterInfo + { + /// + /// Create a new DprFinder instance for the cluster, 
or null if none is configured + IDprFinder GetNewDprFinder(); + + /// + /// workers present in the cluster and human-readable descriptions + IEnumerable<(DarqId, string)> GetMembers(); + + /// + /// number of workers present in the cluster + int GetClusterSize(); + + /// + /// DarqId of interest + /// Return the IP address and port number the given worker is reachable at + (string, int) GetMemberAddress(DarqId worker); + } + + [Serializable] + public class HardCodedClusterInfo : IDarqClusterInfo + { + private Dictionary memberMap; + private string dprFinderIp; + private int dprFinderPort = -1; + + public HardCodedClusterInfo() + { + memberMap = new Dictionary(); + } + + public HardCodedClusterInfo AddWorker(DarqId worker, string description, string ip, int port) + { + memberMap.Add(worker, (description, ip, port)); + return this; + } + + public HardCodedClusterInfo SetDprFinder(string ip, int port) + { + dprFinderIp = ip; + dprFinderPort = port; + return this; + } + + public (string, int) GetDprFinderInfo() => (dprFinderIp, dprFinderPort); + + public IDprFinder GetNewDprFinder() + { + if (dprFinderPort == -1 || dprFinderIp == null) + throw new FasterException("DprFinder location not set!"); + return new RespGraphDprFinder(dprFinderIp, dprFinderPort); + } + + public (string, int) GetMemberAddress(DarqId worker) + { + var (_, ip, port) = memberMap[worker]; + return (ip, port); + } + + public IEnumerable<(DarqId, string)> GetMembers() => + memberMap.Select(e => (e.Key, e.Value.Item1)); + + public int GetClusterSize() => memberMap.Count; + } + + /// + /// Producer client to add entries to DARQ. Should be invoked single-threaded. 
+ /// + public class DarqProducerClient : IDarqProducer + { + private IDarqClusterInfo darqClusterInfo; + private Dictionary clients; + private DprSession dprSession; + private long serialNum = 0; + + /// + /// Creates a new DarqProducerClient + /// + /// Cluster information + /// The DprClientSession to use for speculative return (default if want return only after commit) + public DarqProducerClient(IDarqClusterInfo darqClusterInfo, DprSession session = null) + { + this.darqClusterInfo = darqClusterInfo; + clients = new Dictionary(); + // TODO(Tianyu): Do something about session to set up SU correctly + dprSession = session ?? new DprSession(); + } + + public void Dispose() + { + foreach (var client in clients.Values) + client.Dispose(); + } + + /// + /// Enqueues a sprint into the DARQ. Task will complete when DARQ has acked the enqueue, or when the enqueue is + /// committed and recoverable if waitCommit is true. + /// + /// ID of the DARQ to enqueue onto + /// body of the sprint + /// producer ID to use (for deduplication purposes), or -1 if none + /// lsn to use (for deduplication purposes), should be monotonically increasing in every producer + /// + /// whether to force flush buffer and send all requests. If false, requests are buffered + /// until a set number has been accumulated or until forced to flush + /// + /// + public Task EnqueueMessageAsync(DarqId darqId, ReadOnlySpan message, long producerId = -1, + long lsn = -1, + bool forceFlush = true) + { + if (!clients.TryGetValue(darqId, out var singleClient)) + { + var (ip, port) = darqClusterInfo.GetMemberAddress(darqId); + singleClient = new SingleDarqProducerClient(dprSession, ip, port); + } + + var task = singleClient.EnqueueMessageAsync(message, producerId, lsn); + + if (forceFlush) + { + foreach (var client in clients.Values) + client.Flush(); + } + + return task; + } + + // TODO(Tianyu): Handle socket-related anomalies? 
+ public void EnqueueMessageWithCallback(DarqId darqId, ReadOnlySpan message, Action callback, + long producerId = -1, long lsn = -1) + { + if (!clients.TryGetValue(darqId, out var singleClient)) + { + var (ip, port) = darqClusterInfo.GetMemberAddress(darqId); + singleClient = new SingleDarqProducerClient(dprSession, ip, port); + } + + singleClient.EnqueueMessageWithCallback(message, producerId, lsn, callback); + } + + + public void ForceFlush() + { + foreach (var client in clients.Values) + client.Flush(); + } + } + + internal class SingleDarqProducerClient : IDisposable, INetworkMessageConsumer + { + private readonly INetworkSender networkSender; + + // TODO(Tianyu): Change to something else for DARQ + private readonly MaxSizeSettings maxSizeSettings; + readonly int bufferSize; + private bool disposed; + private int offset; + private int numMessages; + private const int reservedDprHeaderSpace = 160; + + private DprSession dprSession; + private ElasticCircularBuffer> callbackQueue; + + public SingleDarqProducerClient(DprSession dprSession, string address, int port) + { + this.dprSession = dprSession; + maxSizeSettings = new MaxSizeSettings(); + bufferSize = BufferSizeUtils.ClientBufferSize(maxSizeSettings); + + networkSender = new TcpNetworkSender(GetSendSocket(address, port), maxSizeSettings); + networkSender.GetResponseObject(); + offset = 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size; + numMessages = 0; + + callbackQueue = new ElasticCircularBuffer>(); + } + + public void Dispose() + { + disposed = true; + networkSender.Dispose(); + } + + internal unsafe void Flush() + { + try + { + if (offset > 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size) + { + var head = networkSender.GetResponseObjectHead(); + // Set packet size in header + *(int*)head = -(offset - sizeof(int)); + head += sizeof(int); + + ((BatchHeader*)head)->SetNumMessagesProtocol(numMessages, + (WireFormat)DarqProtocolType.DarqProducer); + head += sizeof(BatchHeader); + + // 
Set DprHeader size + *(int*)head = reservedDprHeaderSpace; + head += sizeof(int); + + // populate DPR header + var headerBytes = new Span(head, reservedDprHeaderSpace); + if (dprSession.TagMessage(headerBytes) < 0) + // TODO(Tianyu): Handle size mismatch by probably copying into a new array and up-ing reserved space in the future + throw new NotImplementedException(); + if (!networkSender.SendResponse(0, offset)) + throw new ObjectDisposedException("socket closed"); + + networkSender.GetResponseObject(); + offset = 2 * sizeof(int) + reservedDprHeaderSpace + BatchHeader.Size; + numMessages = 0; + } + } + catch (DprSessionRolledBackException) + { + // Ensure that callback queue is drained only on a single-thread. This is not a scalability issue + // because except in the event of a rollback, callback queue is not concurrently accessed + lock (callbackQueue) + { + // TODO(Tianyu): Eagerly clearing the callback queue here may result in an overapproximation of + // things that are rolled back, but is the expedient approach here. 
Maybe fix later + while (!callbackQueue.IsEmpty()) + callbackQueue.Dequeue()(false); + } + + throw; + } + } + + public Task EnqueueMessageAsync(ReadOnlySpan message, long producerId, long lsn) + { + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + EnqueueMessageWithCallback(message, producerId, lsn, success => + { + if (success) + tcs.SetResult(null); + else + tcs.SetCanceled(); + }); + return tcs.Task; + } + + public void EnqueueMessageWithCallback(ReadOnlySpan message, long producerId, long lsn, + Action callback) + { + EnqueueMessageInternal(message, producerId, lsn, callback); + } + + internal unsafe void EnqueueMessageInternal(ReadOnlySpan message, long producerId, long lsn, + Action action) + { + byte* curr, end; + var entryBatchSize = SerializedDarqEntryBatch.ComputeSerializedSize(message); + while (true) + { + end = networkSender.GetResponseObjectHead() + bufferSize; + curr = networkSender.GetResponseObjectHead() + offset; + var serializedSize = sizeof(byte) + sizeof(long) * 2 + entryBatchSize; + if (end - curr >= serializedSize) break; + Flush(); + } + + *curr = (byte)DarqCommandType.DarqEnqueue; + curr += sizeof(byte); + + *(long*)curr = producerId; + curr += sizeof(long); + *(long*)curr = lsn; + curr += sizeof(long); + + var batch = new SerializedDarqEntryBatch(curr); + batch.SetContent(message); + curr += entryBatchSize; + offset = (int)(curr - networkSender.GetResponseObjectHead()); + numMessages++; + callbackQueue.Enqueue(action); + } + + unsafe void INetworkMessageConsumer.ProcessReplies(byte[] buf, int startOffset, int size) + { + fixed (byte* b = buf) + { + var src = b + startOffset; + + var count = ((BatchHeader*)src)->NumMessages; + src += BatchHeader.Size; + + var dprHeader = new ReadOnlySpan(src, DprMessageHeader.FixedLenSize); + src += DprMessageHeader.FixedLenSize; + + // Ensure that callback queue is drained only on a single-thread. 
This is not a scalability issue + // because except in the event of a rollback, callback queue is not concurrently accessed + lock (callbackQueue) + { + try + { + if (!dprSession.Receive(dprHeader)) return; + for (int i = 0; i < count; i++) + { + var messageType = (DarqCommandType)(*src++); + switch (messageType) + { + case DarqCommandType.DarqEnqueue: + callbackQueue.Dequeue()(true); + break; + // Even though the server could return DarqCommandType.INVALID, this should not get + // past the Receive() call which triggers the rollback codepath + default: + throw new FasterException("Unexpected return type"); + } + } + } + catch (DprSessionRolledBackException) + { + // TODO(Tianyu): Eagerly clearing the callback queue here may result in an overapproximation of + // things that are rolled back, but is the expedient approach here. Maybe fix later + while (!callbackQueue.IsEmpty()) + callbackQueue.Dequeue()(false); + } + } + } + } + + private Socket GetSendSocket(string address, int port, int millisecondsTimeout = -2) + { + var ip = IPAddress.Parse(address); + var endPoint = new IPEndPoint(ip, port); + var socket = new Socket(ip.AddressFamily, SocketType.Stream, ProtocolType.Tcp) + { + NoDelay = true + }; + + if (millisecondsTimeout != -2) + { + IAsyncResult result = socket.BeginConnect(endPoint, null, null); + result.AsyncWaitHandle.WaitOne(millisecondsTimeout, true); + if (socket.Connected) + socket.EndConnect(result); + else + { + socket.Close(); + throw new Exception("Failed to connect server."); + } + } + else + { + socket.Connect(endPoint); + } + + // Ok to create new event args on accept because we assume a connection to be long-running + var receiveEventArgs = new SocketAsyncEventArgs(); + var bufferSize = BufferSizeUtils.ServerBufferSize(maxSizeSettings); + receiveEventArgs.SetBuffer(new byte[bufferSize], 0, bufferSize); + receiveEventArgs.UserToken = new DarqClientNetworkSession(socket, this); + receiveEventArgs.Completed += RecvEventArg_Completed; + var 
response = socket.ReceiveAsync(receiveEventArgs); + Debug.Assert(response); + return socket; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool HandleReceiveCompletion(SocketAsyncEventArgs e) + { + var connState = (DarqClientNetworkSession)e.UserToken; + if (e.BytesTransferred == 0 || e.SocketError != SocketError.Success || disposed) + { + connState.socket.Dispose(); + e.Dispose(); + return false; + } + + connState.AddBytesRead(e.BytesTransferred); + var newHead = connState.TryConsumeMessages(e.Buffer); + if (newHead == e.Buffer.Length) + { + // Need to grow input buffer + var newBuffer = new byte[e.Buffer.Length * 2]; + Array.Copy(e.Buffer, newBuffer, e.Buffer.Length); + e.SetBuffer(newBuffer, newHead, newBuffer.Length - newHead); + } + else + e.SetBuffer(newHead, e.Buffer.Length - newHead); + + return true; + } + + private void RecvEventArg_Completed(object sender, SocketAsyncEventArgs e) + { + try + { + var connState = (DarqClientNetworkSession)e.UserToken; + do + { + // No more things to receive + if (!HandleReceiveCompletion(e)) break; + } while (!connState.socket.ReceiveAsync(e)); + } + // ignore session socket disposed due to client session dispose + catch (ObjectDisposedException) + { + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/gRPC/DarqGrpc.cs b/cs/research/darq/FASTER.darq/gRPC/DarqGrpc.cs new file mode 100644 index 000000000..805862b2f --- /dev/null +++ b/cs/research/darq/FASTER.darq/gRPC/DarqGrpc.cs @@ -0,0 +1,216 @@ +using System.Collections.Concurrent; +using darq.client; +using FASTER.client; +using FASTER.common; +using FASTER.darq; +using FASTER.libdpr; +using FASTER.libdpr.gRPC; +using Google.Protobuf; +using Grpc.Core; +using Grpc.Core.Interceptors; +using Grpc.Net.Client; +using protobuf; +using DarqMessageType = FASTER.libdpr.DarqMessageType; + +namespace darq.gRPC; + +public class DarqGrpcProducerWrapper : IDarqProducer +{ + private Dictionary clusterMap; + private 
ConcurrentDictionary clients = new(); + private DprSession session; + + public DarqGrpcProducerWrapper(Dictionary clusterMap, DprSession session) + { + this.clusterMap = clusterMap; + this.session = session; + } + + public void Dispose() {} + + public void EnqueueMessageWithCallback(DarqId darqId, ReadOnlySpan message, Action callback, long producerId, long lsn) + { + var client = clients.GetOrAdd(darqId, + _ => new DarqGrpcService.DarqGrpcServiceClient(clusterMap[darqId] + .Intercept(new DprClientInterceptor(session)))); + var enqueueRequest = new DarqEnqueueRequest + { + Message = ByteString.CopyFrom(message), + ProducerId = producerId, + Lsn = lsn + }; + Task.Run(async () => + { + try + { + await client.EnqueueAsync(enqueueRequest); + callback(true); + } + catch + { + callback(false); + throw; + } + }); + } + + public void ForceFlush() + { + // TODO(Tianyu): Not implemented for now + } +} + +// TODO(Tianyu): Fix later to be a background service for correct initialization +public class DarqGrpcServiceImpl : DarqGrpcService.DarqGrpcServiceBase, IDisposable +{ + private Darq backend; + private readonly CancellationTokenSource cts; + private readonly ManualResetEventSlim terminationStart; + private readonly CountdownEvent terminationComplete; + private Thread refreshThread; + + private ThreadLocalObjectPool stepRequestPool; + private ThreadLocalObjectPool enqueueRequestPool; + + private long currentIncarnationId; + private DarqScanIterator currentIterator; + + public DarqGrpcServiceImpl(Darq darq) + { + backend = darq; + terminationStart = new ManualResetEventSlim(); + terminationComplete = new CountdownEvent(2); + stepRequestPool = new ThreadLocalObjectPool(() => new StepRequest()); + enqueueRequestPool = new ThreadLocalObjectPool(() => new byte[1 << 15]); + backend.ConnectToCluster(out _); + cts = new CancellationTokenSource(); + + refreshThread = new Thread(() => + { + while (!terminationStart.IsSet) + backend.Refresh(); + terminationComplete.Signal(); + }); + 
refreshThread.Start(); + } + + public void Dispose() + { + terminationStart.Set(); + // TODO(Tianyu): this shutdown process is unsafe and may leave things unsent/unprocessed in the queue + backend.ForceCheckpoint(); + cts.Cancel(); + Thread.Sleep(2000); + terminationComplete.Wait(); + backend.Dispose(); + refreshThread.Join(); + } + + public Darq GetDarq() => backend; + + public override async Task RegisterProcessor(RegisterProcessorRequest request, + ServerCallContext context) + { + var result = await backend.RegisterNewProcessorAsync(); + // Not a serial bottleneck as we don't expect this to be invoked under high concurrency + lock (this) + { + currentIncarnationId = result; + currentIterator = backend.StartScan(true); + } + return new RegisterProcessorResult + { + IncarnationId = result + }; + } + + public override Task Step(DarqStepRequest request, ServerCallContext context) + { + var requestObject = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(requestObject); + foreach (var consumed in request.ConsumedMessages) + requestBuilder.MarkMessageConsumed(consumed); + foreach (var self in request.SelfMessages) + requestBuilder.AddRecoveryMessage(self.Span); + foreach (var outMessage in request.OutMessages) + requestBuilder.AddOutMessage(new DarqId(outMessage.Recipient), outMessage.Message.Span); + var result = backend.Step(request.IncarnationId, requestBuilder.FinishStep()); + stepRequestPool.Return(requestObject); + + return Task.FromResult(new DarqStepResult + { + Status = result switch + { + // Should never happen + StepStatus.INCOMPLETE => throw new NotImplementedException(), + StepStatus.SUCCESS => DarqStepStatus.Success, + StepStatus.INVALID => DarqStepStatus.Invalid, + StepStatus.REINCARNATED => DarqStepStatus.Reincarnated, + _ => throw new ArgumentOutOfRangeException() + } + }); + } + + public override Task Enqueue(DarqEnqueueRequest request, ServerCallContext context) + { + var enqueueBuffer = enqueueRequestPool.Checkout(); + 
SerializedDarqEntryBatch enqueueRequest; + unsafe + { + fixed (byte* b = enqueueBuffer) + { + enqueueRequest = new SerializedDarqEntryBatch(b); + enqueueRequest.SetContent(request.Message.Span); + } + } + + var ok = backend.Enqueue(enqueueRequest, request.ProducerId, request.Lsn); + enqueueRequestPool.Return(enqueueBuffer); + return Task.FromResult(new DarqEnqueueResult + { + Ok = ok + }); + } + + public override Task Poll(DarqPollRequest request, ServerCallContext context) + { + // Not a serial bottleneck as we don't expect this to be invoked under high concurrency + lock (this) + { + // Not able to poll if you are not the recognized consumer + if (currentIncarnationId != request.IncarnationId) + return Task.FromResult(new DarqPollResult + { + Ok = false + }); + + var result = new DarqPollResult { Ok = true }; + unsafe + { + for (var i = 0; i < request.MaxBatchSize; i++) + { + if (!currentIterator.UnsafeGetNext(out var b, out var length, out _, out _, out var type)) + break; + if (type is DarqMessageType.IN or DarqMessageType.RECOVERY) + { + var darqMessage = new protobuf.DarqMessage + { + Type = type switch + { + DarqMessageType.IN => protobuf.DarqMessageType.In, + DarqMessageType.RECOVERY => protobuf.DarqMessageType.Self, + _ => throw new ArgumentOutOfRangeException() + }, + MesssageBody = ByteString.CopyFrom(new ReadOnlySpan(b, length)) + }; + result.Messages.Add(darqMessage); + } + + currentIterator.UnsafeRelease(); + } + } + return Task.FromResult(result); + + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/gRPC/proto/darq.proto b/cs/research/darq/FASTER.darq/gRPC/proto/darq.proto new file mode 100644 index 000000000..f82c5ced9 --- /dev/null +++ b/cs/research/darq/FASTER.darq/gRPC/proto/darq.proto @@ -0,0 +1,69 @@ +syntax = "proto3"; +option csharp_namespace = "protobuf"; + + +service DarqGrpcService { + rpc RegisterProcessor(RegisterProcessorRequest) returns (RegisterProcessorResult); + rpc Step(DarqStepRequest) returns 
(DarqStepResult); + rpc Enqueue(DarqEnqueueRequest) returns (DarqEnqueueResult); + rpc Poll(DarqPollRequest) returns (DarqPollResult); +} + +message RegisterProcessorRequest { +} + +message RegisterProcessorResult { + int64 incarnationId = 1; +} + +message DarqOutMessage { + int64 recipient = 1; + bytes message = 2; +} + +message DarqStepRequest { + int64 incarnationId = 1; + repeated int64 consumedMessages = 2; + repeated DarqOutMessage outMessages = 3; + repeated bytes selfMessages = 4; +} + +enum DarqStepStatus { + SUCCESS = 0; + INVALID = 1; + REINCARNATED = 2; +} + +message DarqStepResult { + DarqStepStatus status = 1; +} + +message DarqEnqueueRequest { + bytes message = 1; + int64 producerId = 2; + int64 lsn = 3; +} + +message DarqEnqueueResult { + bool ok = 1; +} + +message DarqPollRequest { + int64 incarnationId = 1; + int32 maxBatchSize = 2; +} + +enum DarqMessageType { + IN = 0; + SELF = 2; +} + +message DarqMessage { + DarqMessageType type = 1; + bytes messsageBody = 2; +} + +message DarqPollResult { + bool ok = 1; + repeated DarqMessage messages = 2; +} diff --git a/cs/research/darq/FASTER.darq/server/DarqProcessorSession.cs b/cs/research/darq/FASTER.darq/server/DarqProcessorSession.cs new file mode 100644 index 000000000..309efd27f --- /dev/null +++ b/cs/research/darq/FASTER.darq/server/DarqProcessorSession.cs @@ -0,0 +1,144 @@ +using System.Runtime.CompilerServices; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; + +namespace FASTER.server +{ + internal sealed unsafe class DarqProcessorSession : ServerSessionBase where TVersionScheme : IVersionScheme + { + readonly HeaderReaderWriter hrw; + int readHead; + int seqNo, msgnum, start; + private Darq darq; + + public DarqProcessorSession(INetworkSender networkSender, Darq darq) : base(networkSender) + { + this.darq = darq; + } + + public override int TryConsumeMessages(byte* req_buf, int bytesReceived) + { + bytesRead = bytesReceived; + readHead = 0; + while 
(TryReadMessages(req_buf, out var offset)) + ProcessBatch(req_buf, offset); + return readHead; + } + + private bool TryReadMessages(byte* buf, out int offset) + { + offset = default; + + var bytesAvailable = bytesRead - readHead; + // Need to at least have read off of size field on the message + if (bytesAvailable < sizeof(int)) return false; + + // MSB is 1 to indicate binary protocol + var size = -(*(int*)(buf + readHead)); + + // Not all of the message has arrived + if (bytesAvailable < size + sizeof(int)) return false; + offset = readHead + sizeof(int); + + // Consume this message and the header + readHead += size + sizeof(int); + return true; + } + + private void ProcessBatch(byte* buf, int offset) + { + var d = networkSender.GetResponseObjectHead(); + byte* b = buf + offset; + var dend = networkSender.GetResponseObjectTail(); + var dcurr = d + sizeof(int); // reserve space for size + + var src = b; + ref var header = ref Unsafe.AsRef(src); + var num = header.NumMessages; + src += BatchHeader.Size; + dcurr += BatchHeader.Size; + + var dprResponseOffset = (int*)dcurr; + dcurr += DprMessageHeader.FixedLenSize; + start = 0; + msgnum = 0; + + var dprHeaderSize = *(int*)src; + src += sizeof(int); + var request = new ReadOnlySpan(src, dprHeaderSize); + src += dprHeaderSize; + // Error code path + if (!darq.TryReceiveAndStartAction(request)) + { + for (msgnum = 0; msgnum < num; msgnum++) + hrw.Write((byte)DarqCommandType.INVALID, ref dcurr, (int)(dend - dcurr)); + // Can immediately send DPR error version regardless of version or status + } + else + { + for (msgnum = 0; msgnum < num; msgnum++) + { + var message = (DarqCommandType)(*src++); + switch (message) + { + case DarqCommandType.DarqStep: + { + var processorId = *(long*)src; + src += sizeof(long); + + var batch = new SerializedDarqEntryBatch(src); + var response = darq.Step(processorId, batch); + hrw.Write((byte) message, ref dcurr, (int)(dend - dcurr)); + *(StepStatus*)dcurr = response; + dcurr += 
sizeof(StepStatus); + break; + } + case DarqCommandType.DarqRegisterProcessor: + { + var consumerId = darq.RegisterNewProcessor(); + hrw.Write((byte) message, ref dcurr, (int)(dend - dcurr)); + *(long*)dcurr = consumerId; + dcurr += sizeof(long); + break; + } + default: + throw new NotImplementedException(); + } + } + } + + darq.ProduceTagAndEndAction(new Span(dprResponseOffset, DprMessageHeader.FixedLenSize)); + // Send replies + Send(d, dcurr); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Send(byte* d, byte* dcurr) + { + var dstart = d + sizeof(int); + ((BatchHeader*)dstart)->SetNumMessagesProtocol(msgnum - start, (WireFormat) DarqProtocolType.DarqProcessor); + ((BatchHeader*)dstart)->SeqNo = seqNo++; + int payloadSize = (int)(dcurr - d); + // Set packet size in header + *(int*)networkSender.GetResponseObjectHead() = -(payloadSize - sizeof(int)); + if (!networkSender.SendResponse(0, payloadSize)) + throw new ObjectDisposedException("socket closed"); + } + + + public override void Publish(ref byte* keyPtr, int keyLength, ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + + public override void PrefixPublish(byte* prefixPtr, int prefixLength, ref byte* keyPtr, int keyLength, + ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/server/DarqProducerSession.cs b/cs/research/darq/FASTER.darq/server/DarqProducerSession.cs new file mode 100644 index 000000000..015ba97f9 --- /dev/null +++ b/cs/research/darq/FASTER.darq/server/DarqProducerSession.cs @@ -0,0 +1,167 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; + +namespace FASTER.server +{ + internal class ProducerResponseBuffer : IDisposable + 
{ + internal long version; + internal byte[] buf; + internal int size; + internal INetworkSender networkSender; + internal SimpleObjectPool pool; + + public ProducerResponseBuffer(int bufferSize, INetworkSender networkSender, + SimpleObjectPool pool) + { + buf = new byte[bufferSize]; + this.pool = pool; + this.networkSender = networkSender; + } + + public void Dispose() => pool?.Return(this); + } + + internal sealed unsafe class DarqProducerSession : ServerSessionBase where TVersionScheme : IVersionScheme + { + readonly HeaderReaderWriter hrw; + int readHead; + int seqNo, msgnum, start; + private Darq darq; + private readonly SimpleObjectPool sendBufferPool; + private ConcurrentQueue responseQueue; + + public DarqProducerSession(INetworkSender networkSender, Darq darq, + ConcurrentQueue responseQueue) : base( + networkSender) + { + this.darq = darq; + var size = BufferSizeUtils.ServerBufferSize(networkSender.GetMaxSizeSettings); + sendBufferPool = new SimpleObjectPool(() => + new ProducerResponseBuffer(size, this.networkSender, sendBufferPool)); + this.responseQueue = responseQueue; + } + + public override int TryConsumeMessages(byte* req_buf, int bytesReceived) + { + bytesRead = bytesReceived; + readHead = 0; + while (TryReadMessages(req_buf, out var offset)) + ProcessBatch(req_buf, offset); + return readHead; + } + + private bool TryReadMessages(byte* buf, out int offset) + { + offset = default; + + var bytesAvailable = bytesRead - readHead; + // Need to at least have read off of size field on the message + if (bytesAvailable < sizeof(int)) return false; + + // MSB is 1 to indicate binary protocol + var size = -(*(int*)(buf + readHead)); + + // Not all of the message has arrived + if (bytesAvailable < size + sizeof(int)) return false; + offset = readHead + sizeof(int); + + // Consume this message and the header + readHead += size + sizeof(int); + return true; + } + + private void ProcessBatch(byte* buf, int offset) + { + var response = sendBufferPool.Checkout(); 
+ byte* b = buf + offset; + fixed (byte* d = response.buf) + { + var dend = d + response.buf.Length; + var dcurr = d + sizeof(int); // reserve space for size + + var src = b; + ref var header = ref Unsafe.AsRef(src); + var num = header.NumMessages; + src += BatchHeader.Size; + dcurr += BatchHeader.Size; + + var dprResponseOffset = dcurr; + dcurr += DprMessageHeader.FixedLenSize; + start = 0; + msgnum = 0; + + var dprHeaderSize = *(int*)src; + src += sizeof(int); + var request = new ReadOnlySpan(src, dprHeaderSize); + src += dprHeaderSize; + if (!darq.TryReceiveAndStartAction(request)) + { + for (msgnum = 0; msgnum < num; msgnum++) + hrw.Write((byte)DarqCommandType.INVALID, ref dcurr, (int)(dend - dcurr)); + // Can immediately send DPR error version regardless of version or status + response.version = 0; + } + else + { + for (msgnum = 0; msgnum < num; msgnum++) + { + var message = (DarqCommandType)(*src++); + Debug.Assert(message == DarqCommandType.DarqEnqueue); + var producer = *(long*)src; + src += sizeof(long); + var lsn = *(long*)src; + src += sizeof(long); + var batch = new SerializedDarqEntryBatch(src); + + darq.Enqueue(batch, producer, lsn); + src += batch.TotalSize(); + hrw.Write((byte)message, ref dcurr, (int)(dend - dcurr)); + } + + response.version = darq.Version(); + } + + + darq.ProduceTagAndEndAction(new Span(dprResponseOffset, DprMessageHeader.FixedLenSize)); + SendResponseBuffer(d, dcurr, response); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void SendResponseBuffer(byte* d, byte* dcurr, ProducerResponseBuffer response) + { + var dstart = d + sizeof(int); + ((BatchHeader*)dstart)->SetNumMessagesProtocol(msgnum - start, (WireFormat)DarqProtocolType.DarqProducer); + ((BatchHeader*)dstart)->SeqNo = seqNo++; + var payloadSize = response.size = (int)(dcurr - d); + // Set packet size in header + *(int*)d = -(payloadSize - sizeof(int)); + + if (responseQueue == null || response.version >= darq.CommittedVersion()) + // 
TODO(Tianyu): Figure out how to handle errors + networkSender.SendResponse(response.buf, 0, payloadSize, response.Dispose); + else + responseQueue.Enqueue(response); + } + + + public override void Publish(ref byte* keyPtr, int keyLength, ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + + public override void PrefixPublish(byte* prefixPtr, int prefixLength, ref byte* keyPtr, int keyLength, + ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/server/DarqServer.cs b/cs/research/darq/FASTER.darq/server/DarqServer.cs new file mode 100644 index 000000000..ce89408a8 --- /dev/null +++ b/cs/research/darq/FASTER.darq/server/DarqServer.cs @@ -0,0 +1,135 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using darq; +using darq.client; +using FASTER.client; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; + +namespace FASTER.server +{ + public class DarqProvider : ISessionProvider where TVersionScheme : IVersionScheme + { + private Darq backend; + private ConcurrentQueue responseQueue; + + internal DarqProvider(Darq backend, ConcurrentQueue responseQueue) + { + this.backend = backend; + GetMaxSizeSettings = new MaxSizeSettings(); + this.responseQueue = responseQueue; + } + + public IMessageConsumer GetSession(WireFormat wireFormat, INetworkSender networkSender) + { + switch ((DarqProtocolType) wireFormat) + { + case DarqProtocolType.DarqSubscribe: + return new DarqSubscriptionSession(networkSender, backend); + case DarqProtocolType.DarqProducer: + return new DarqProducerSession(networkSender, backend, responseQueue); + case DarqProtocolType.DarqProcessor: + return new DarqProcessorSession(networkSender, backend); + default: + throw new NotSupportedException(); + } + } + + public MaxSizeSettings GetMaxSizeSettings 
{ get; } + } + + public class DarqServer : IDisposable where TVersionScheme : IVersionScheme + { + private readonly IFasterServer server; + private readonly Darq darq; + private readonly DarqProvider provider; + private readonly DarqBackgroundMaintenanceTask maintenanceTask; + private readonly CancellationTokenSource cts; + private readonly ManualResetEventSlim terminationStart; + private readonly CountdownEvent terminationComplete; + private Thread refreshThread, responseThread; + private ConcurrentQueue responseQueue; + + public DarqServer(DarqServerOptions options, TVersionScheme versionScheme) + { + darq = new Darq(options.DarqSettings, versionScheme); + // tODO(Tianyu): Broken + // maintenanceTask = new DarqBackgroundMaintenanceTask(darq, new DarqMaintenanceBackgroundServiceSettings + // { + // morselSize = 512, + // batchSize = 06, + // producerFactory = session => new DarqProducerClient(options.ClusterInfo, session) + // }); + cts = new CancellationTokenSource(); + terminationStart = new ManualResetEventSlim(); + terminationComplete = new CountdownEvent(2); + darq.ConnectToCluster(out _); + responseQueue = new ConcurrentQueue(); + provider = new DarqProvider(darq, responseQueue); + server = new FasterServerTcp(options.Address, options.Port); + // Check that our custom defined wire format is not clashing with anything implemented by FASTER + Debug.Assert(!Enum.IsDefined(typeof(WireFormat), (WireFormat) (int) DarqProtocolType.DarqSubscribe)); + Debug.Assert(!Enum.IsDefined(typeof(WireFormat), (WireFormat) (int)DarqProtocolType.DarqProcessor)); + Debug.Assert(!Enum.IsDefined(typeof(WireFormat), (WireFormat) (int)DarqProtocolType.DarqProducer)); + + server.Register((WireFormat) DarqProtocolType.DarqSubscribe, provider); + server.Register((WireFormat) DarqProtocolType.DarqProcessor, provider); + server.Register((WireFormat) DarqProtocolType.DarqProducer, provider); + } + + public Darq GetDarq() => darq; + + public long BackgroundProcessingLag => 
maintenanceTask.ProcessingLag; + + public void Start() + { + server.Start(); + Task.Run(async () => await maintenanceTask.RunAsync(cts.Token)); + + refreshThread = new Thread(() => + { + while (!terminationStart.IsSet) + darq.Refresh(); + terminationComplete.Signal(); + }); + refreshThread.Start(); + + responseThread = new Thread(async () => + { + while (!terminationStart.IsSet && responseQueue != null && !responseQueue.IsEmpty) + { + // TODO(Tianyu): current implementation may have response buffers in the queue with versions + // out-of-order, resulting in some responses getting sent later than necessary + while (responseQueue.TryPeek(out var response)) + { + if (response.version <= darq.CommittedVersion()) + // TODO(Tianyu): Figure out how to handle errors + response.networkSender.SendResponse(response.buf, 0, response.size, response.Dispose); + responseQueue.TryDequeue(out _); + } + + await darq.NextCommit(); + } + + terminationComplete.Signal(); + }); + responseThread.Start(); + } + + public void Dispose() + { + terminationStart.Set(); + // TODO(Tianyu): this shutdown process is unsafe and may leave things unsent/unprocessed in the queue + darq.ForceCheckpoint(); + cts.Cancel(); + Thread.Sleep(2000); + maintenanceTask?.Dispose(); + server.Dispose(); + terminationComplete.Wait(); + darq.Dispose(); + refreshThread.Join(); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/server/DarqServerOptions.cs b/cs/research/darq/FASTER.darq/server/DarqServerOptions.cs new file mode 100644 index 000000000..c52a0d0e3 --- /dev/null +++ b/cs/research/darq/FASTER.darq/server/DarqServerOptions.cs @@ -0,0 +1,26 @@ +using darq; +using FASTER.client; +using FASTER.darq; + +namespace FASTER.server +{ + /// + /// Options when creating DARQ server + /// + public class DarqServerOptions + { + /// + /// Port to run server on. + /// + public int Port = 3278; + + /// + /// IP address to bind server to. 
+ /// + public string Address = "127.0.0.1"; + + public IDarqClusterInfo ClusterInfo; + + public DarqSettings DarqSettings; + } +} \ No newline at end of file diff --git a/cs/research/darq/FASTER.darq/server/DarqSubscriptionSession.cs b/cs/research/darq/FASTER.darq/server/DarqSubscriptionSession.cs new file mode 100644 index 000000000..4eda69c55 --- /dev/null +++ b/cs/research/darq/FASTER.darq/server/DarqSubscriptionSession.cs @@ -0,0 +1,245 @@ +using System.Diagnostics; +using System.Runtime.CompilerServices; +using FASTER.common; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; + +namespace FASTER.server +{ + internal sealed class DarqSubscriptionSession : ServerSessionBase where TVersionScheme : IVersionScheme + { + readonly HeaderReaderWriter hrw; + int readHead; + int seqNo, msgnum, start; + private Darq dprServer; + private ManualResetEventSlim terminationStart, terminationComplete; + private Thread pushThread; + private unsafe byte* dcurr, dend; + private unsafe int *dprResponseOffset; + // TODO(Tianyu): Hacky + private static int MAX_BATCH_SIZE = 1 << 10; + // TODO(Tianyu): Hacky + private byte[] tempBuffer = new byte[1 << 12]; + + + public DarqSubscriptionSession(INetworkSender networkSender, Darq dprServer) : base(networkSender) + { + this.dprServer = dprServer; + } + + private unsafe bool TryReadMessages(byte* buf, out int offset) + { + offset = default; + + var bytesAvailable = bytesRead - readHead; + // Need to at least have read off of size field on the message + if (bytesAvailable < sizeof(int)) return false; + + // MSB is 1 to indicate binary protocol + var size = -(*(int*) buf); + + // Not all of the message has arrived + if (bytesAvailable < size + sizeof(int)) return false; + offset = readHead + sizeof(int); + + // Consume this message and the header + readHead += size + sizeof(int); + return true; + } + + public override void Dispose() + { + terminationStart.Set(); + terminationComplete.Wait(); + pushThread?.Join(); + 
base.Dispose(); + } + + public override unsafe int TryConsumeMessages(byte* req_buf, int bytesReceived) + { + bytesRead = bytesReceived; + readHead = 0; + while (TryReadMessages(req_buf, out var offset)) + ProcessBatch(req_buf, offset); + return readHead; + } + + private unsafe void ProcessBatch(byte* buf, int offset) + { + var src = buf + offset; + ref var header = ref Unsafe.AsRef(src); + var num = header.NumMessages; + src += BatchHeader.Size; + for (msgnum = 0; msgnum < num; msgnum++) + { + var m = (DarqCommandType) (*src++); + switch (m) + { + case DarqCommandType.DarqStartPush: + { + // TODO(Tianyu): Wots dis? + var speculative = (*src++) == 1; + var t = new ManualResetEventSlim(); + if (Interlocked.CompareExchange(ref terminationStart, t, null) == null) + { + terminationComplete = new ManualResetEventSlim(); + StartPushEntries(); + } + break; + } + default: + throw new NotImplementedException(); + } + } + } + + // TODO(Tianyu): SU integration needs to happen here + private void StartPushEntries() + { + pushThread = new Thread(async () => + { + using var it = dprServer.StartScan(true); + while (!terminationStart.IsSet) + { + ResetSendBuffer(); + while (TrySendEntry(it) && msgnum < MAX_BATCH_SIZE) {} + if (msgnum != 0) + SendCurrentBuffer(); + else + dprServer.EndAction(); + + if (terminationStart.IsSet) break; + + // dprServer.StateObject().RefreshSafeReadTail(); + var iteratorWait = it.WaitAsync().AsTask(); + if (await Task.WhenAny(iteratorWait, Task.Delay(10)) == iteratorWait) + { + // No more entries, can signal finished and return + if (!iteratorWait.Result) break; + } + // Otherwise, just continue looping + } + + terminationComplete.Set(); + }); + pushThread.Start(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void ResetSendBuffer() + { + msgnum = 0; + networkSender.GetResponseObject(); + var d = networkSender.GetResponseObjectHead(); + dend = networkSender.GetResponseObjectTail(); + dcurr = d + sizeof(int); // reserve 
space for size + dcurr += BatchHeader.Size; + dprResponseOffset = (int*) dcurr; + dcurr += sizeof(int); + dprServer.StartLocalAction(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void SendCurrentBuffer() + { + // Finish composing DPR batch + *dprResponseOffset = (int) (dcurr - (byte*) dprResponseOffset) - sizeof(int); + // reserve a size field for DPR header; + var dprHeaderSizeField = (int*) dcurr; + dcurr += sizeof(int); + dprServer.ProduceTagAndEndAction(new Span(dcurr, (int) (dend - dcurr))); + dcurr += DprMessageHeader.FixedLenSize; + // Write size + *dprHeaderSizeField = DprMessageHeader.FixedLenSize; + Debug.Assert(dcurr < dend); + + var d = networkSender.GetResponseObjectHead(); + var dstart = d + sizeof(int); + ((BatchHeader*) dstart)->SetNumMessagesProtocol(msgnum - start, (WireFormat) DarqProtocolType.DarqSubscribe); + ((BatchHeader*) dstart)->SeqNo = seqNo++; + int payloadSize = (int) (dcurr - d); + // Set packet size in header + *(int*) d = -(payloadSize - sizeof(int)); + networkSender.SendResponse(0, payloadSize); + } + + // TODO(Tianyu): More efficiently batch entries together once we figure out how to get size computation to work + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe bool TrySendEntry(DarqScanIterator it) + { + if (it.UnsafeGetNext(out var entry, out var entryLength, out var lsn, out var nextLsn, out var type)) + { + + if (type != DarqMessageType.IN && type != DarqMessageType.RECOVERY) + { + it.UnsafeRelease(); + return true; + } + + var spaceRequired = 2 * sizeof(long) + sizeof(int) + sizeof(MessageType) + entryLength; + // TODO(Tianyu): hacky --- we are not supposed to know about header size details as that's implementation specific + var dprHeaderSpace = (1 + msgnum) * sizeof(long) + DprMessageHeader.FixedLenSize + sizeof(int); + + if (dend - dcurr < spaceRequired + dprHeaderSpace) + { + // TODO(Tianyu): Not very elegant --- either don't use the unsafe interface here or do something 
else? + // Must release iterator epoch as we may block on the send. Copy the entry before doing that + if (tempBuffer.Length < entryLength) tempBuffer = new byte[entryLength]; + new Span(entry, entryLength).CopyTo(tempBuffer); + it.UnsafeRelease(); + + SendCurrentBuffer(); + ResetSendBuffer(); + + msgnum++; + *(long*) dcurr = lsn; + dcurr += sizeof(long); + *(long*) dcurr = nextLsn; + dcurr += sizeof(long); + *(DarqMessageType*) dcurr = type; + dcurr += sizeof(DarqMessageType); + *(int*) dcurr = entryLength; + dcurr += sizeof(int); + fixed (byte *b = tempBuffer) + Buffer.MemoryCopy(b, dcurr, entryLength, entryLength); + dcurr += entryLength; + Debug.Assert(dcurr < dend); + } + else + { + msgnum++; + *(long*) dcurr = lsn; + dcurr += sizeof(long); + *(long*) dcurr = nextLsn; + dcurr += sizeof(long); + *(DarqMessageType*) dcurr = type; + dcurr += sizeof(DarqMessageType); + *(int*) dcurr = entryLength; + dcurr += sizeof(int); + Buffer.MemoryCopy(entry, dcurr, entryLength, entryLength); + it.UnsafeRelease(); + dcurr += entryLength; + Debug.Assert(dcurr < dend); + } + + return true; + } + return false; + } + + + public override unsafe void Publish(ref byte* keyPtr, int keyLength, ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + + public override unsafe void PrefixPublish(byte* prefixPtr, int prefixLength, ref byte* keyPtr, int keyLength, + ref byte* valPtr, int valLength, + ref byte* inputPtr, int sid) + { + throw new System.NotImplementedException(); + } + } +} \ No newline at end of file diff --git a/cs/research/darq/SpFasterMicrobench/NonDseFasterService.cs b/cs/research/darq/SpFasterMicrobench/NonDseFasterService.cs new file mode 100644 index 000000000..562f7ab0a --- /dev/null +++ b/cs/research/darq/SpFasterMicrobench/NonDseFasterService.cs @@ -0,0 +1,211 @@ +using dse.services; +using FASTER.common; +using FASTER.core; +using Grpc.Core; +using Microsoft.Extensions.Hosting; +using 
Microsoft.Extensions.Logging; +using protobuf; + +namespace microbench; + +public class NonDseFasterBackgroundService : BackgroundService +{ + public FasterKV kv; + private Thread checkpointThread; + private long checkpointInterval = 10; + private ThreadLocalObjectPool>> + sessions; + + private ILogger logger; + private FasterKvReservationStartFile file; + + public NonDseFasterBackgroundService(FasterKV kv, + FasterKvReservationStartFile file, ILogger logger) + { + this.kv = kv; + this.file = file; + this.logger = logger; + sessions = + new ThreadLocalObjectPool< + ClientSession>>(() => + this.kv.NewSession(new ReserveFunctions())); + } + + private void LoadFromFile(string filename) + { + using var reader = new StreamReader(filename); + var s = sessions.Checkout(); + for (var line = reader.ReadLine(); line != null; line = reader.ReadLine()) + { + var parts = line.Split(','); + var offeringId = long.Parse(parts[0]); + var entityId = long.Parse(parts[1]); + var price = int.Parse(parts[2]); + var count = int.Parse(parts[3]); + + var key = new Key(TableId.OFFERINGS, offeringId); + var val = Value.CreateOffering(offeringId, entityId, price, count); + var status = s.Upsert(ref key, ref val); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + } + + sessions.Return(s); + var task = kv.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver); + if (!task.IsCompleted) + task.AsTask().GetAwaiter().GetResult(); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + logger.LogInformation("Faster service is starting..."); + if (!file.file.Equals("")) + LoadFromFile(file.file); + checkpointThread = new Thread(() => + { + while (!stoppingToken.IsCancellationRequested) + { + kv.TryInitiateHybridLogCheckpoint(out _, CheckpointType.FoldOver); + Thread.Sleep((int)checkpointInterval); + } + }); + checkpointThread.Start(); + await 
Task.Delay(Timeout.InfiniteTimeSpan, stoppingToken); + logger.LogInformation("Faster service is stopping..."); + checkpointThread.Join(); + } + + public Task MakeReservation(ReservationRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingId); + var reservationCount = request.Count; + var success = false; + var status = s.RMW(ref offeringKey, ref reservationCount, ref success); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + if (!success) + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + + var reservationsKey = new Key(TableId.RESERVATIONS, request.ReservationId); + var reservationsEntry = Value.CreateReservation(request.ReservationId, request.OfferingId, + request.CustomerId, request.Count); + status = s.Upsert(ref reservationsKey, ref reservationsEntry); + if (status.IsCanceled) + { + // this reservation is a duplicate, roll back earlier update + reservationCount = -reservationCount; + status = s.RMW(ref offeringKey, ref reservationCount, ref success); + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + } + else if (status.IsCompletedSuccessfully) + { + return Task.FromResult(new ReservationResponse + { + Ok = true + }); + } + else + throw new NotImplementedException(); + } + finally + { + sessions.Return(s); + } + } + + public Task CancelReservation(ReservationRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingId); + var reservationCount = request.Count; + var reservationsKey = new Key(TableId.RESERVATIONS, request.ReservationId); + + var status = s.Delete(ref reservationsKey); + if (status.NotFound) + { + return Task.FromResult(new ReservationResponse + { + Ok = false + }); + } + + // Add 
updates back to count + reservationCount = -reservationCount; + var success = false; + status = s.RMW(ref offeringKey, ref reservationCount, ref success); + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new ReservationResponse + { + Ok = true + }); + } + finally + { + sessions.Return(s); + } + } + + public Task AddOffering(AddOfferingRequest request) + { + var s = sessions.Checkout(); + try + { + var offeringKey = new Key(TableId.OFFERINGS, request.OfferingToAdd.OfferingId); + var offeringEntry = Value.CreateOffering(request.OfferingToAdd.OfferingId, request.OfferingToAdd.EntityId, + request.OfferingToAdd.Price, request.OfferingToAdd.RemainingCount); + var status = s.Upsert(ref offeringKey, ref offeringEntry); + if (status.IsCanceled) + return Task.FromResult(new AddOfferingResponse + { + Ok = false + }); + // Not planning on running into larger-than-mem or other complex situations + if (!status.IsCompletedSuccessfully) throw new NotImplementedException(); + return Task.FromResult(new AddOfferingResponse + { + Ok = true + }); + } + finally + { + sessions.Return(s); + } + } +} + +public class NonDseReservationService : FasterKVReservationService.FasterKVReservationServiceBase +{ + private NonDseFasterBackgroundService faster; + + public NonDseReservationService(NonDseFasterBackgroundService faster) + { + this.faster = faster; + } + + public override Task MakeReservation(ReservationRequest request, ServerCallContext context) + { + return faster.MakeReservation(request); + } + + public override Task CancelReservation(ReservationRequest request, ServerCallContext context) + { + return faster.CancelReservation(request); + } + + public override Task AddOffering(AddOfferingRequest request, ServerCallContext context) + { + return faster.AddOffering(request); + } +} \ No newline at end of file diff --git a/cs/research/darq/SpFasterMicrobench/Program.cs b/cs/research/darq/SpFasterMicrobench/Program.cs new file mode 100644 
index 000000000..aadb738e6 --- /dev/null +++ b/cs/research/darq/SpFasterMicrobench/Program.cs @@ -0,0 +1,221 @@ +using System.Diagnostics; +using System.Net; +using CommandLine; +using dse.services; +using FASTER.core; +using FASTER.libdpr; +using FASTER.libdpr.gRPC; +using Grpc.Net.Client; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using protobuf; +using Task = System.Threading.Tasks.Task; + +namespace microbench; + +public class Options +{ + [Option('t', "type", Required = true, + HelpText = "type of worker to launch")] + public string Type { get; set; } + + + [Option('w', "window", Required = false, + HelpText = "number of outstanding client requests allowed")] + public int Window { get; set; } +} + +public class Program +{ + public static async Task Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + switch (options.Type) + { + case "dse": + await LaunchDseReservationService(); + break; + case "baseline": + await LaunchNonDseReservationService(); + break; + case "client": + { + var requests = new List(); + foreach (var line in File.ReadLines( + "C:\\Users\\tianyu\\Documents\\FASTER\\cs\\research\\darq\\workloads\\workload-micro-faster-client.csv")) + { + var split = line.Split(','); + requests.Add(new ReservationRequest + { + ReservationId = long.Parse(split[2]), + OfferingId = long.Parse(split[3]), + CustomerId = long.Parse(split[4]), + Count = int.Parse(split[5]) + }); + } + var latencies = new List(); + for (var i = 0; i < requests.Count; i++) + latencies.Add(0); + + var clients = new List(); + for (var i = 0; i < 8; i++) + { + var channel = GrpcChannel.ForAddress("http://10.0.0.4:15721"); + clients.Add(new 
FasterKVReservationService.FasterKVReservationServiceClient(channel)); + } + + var semaphore = new SemaphoreSlim(options.Window, options.Window); + var stopwatch = Stopwatch.StartNew(); + for (var i = 0; i < requests.Count; i++) + { + + await semaphore.WaitAsync(); + var startTime = stopwatch.ElapsedTicks; + var i1 = i; + _ = Task.Run(async () => + { + await clients[i1 % 8].MakeReservationAsync(requests[i1]); + semaphore.Release(); + latencies[i1] = stopwatch.ElapsedTicks - startTime; + }); + } + await semaphore.WaitAsync(); + var totalTime = stopwatch.ElapsedMilliseconds; + Console.WriteLine($"Throughput: {1000.0 * requests.Count / totalTime}"); + + var ticksPerMillisecond = Stopwatch.Frequency / 1000.0; + + // Convert Stopwatch ticks to milliseconds + var milliseconds = latencies.Select(t => t / ticksPerMillisecond).ToList(); + milliseconds.Sort(); + var average = milliseconds.Average(); + + // Calculate median + double median = 0; + var midIndex = milliseconds.Count / 2; + if (milliseconds.Count % 2 == 0) + median = (milliseconds[midIndex - 1] + milliseconds[midIndex]) / 2.0; + else + median = milliseconds[midIndex]; + + // Calculate 95th percentile + var p95Index = (int)Math.Ceiling(0.95 * milliseconds.Count) - 1; + var p95 = milliseconds[p95Index]; + + // Output results + Console.WriteLine($"Average Latency: {average}"); + Console.WriteLine($"Median Latency: {median}"); + Console.WriteLine($"95th Percentile Latency: {p95}"); + break; + } + } + } + + public static async Task LaunchDseReservationService() + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, 15721, + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + + var checkpointManager = new DeviceLogCommitCheckpointManager( + new 
NullNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\service"), removeOutdated: false); + builder.Services.AddSingleton(new FasterKVSettings + { + IndexSize = 1 << 25, + LogDevice = new NullDevice(), + PageSize = 1 << 25, + SegmentSize = 1 << 30, + MemorySize = 1 << 31, + CheckpointManager = checkpointManager, + TryRecoverLatest = false, + }); + builder.Services.AddSingleton>(); + builder.Services.AddSingleton(new DprWorkerOptions + { + Me = new DprWorkerId(0), + DprFinder = new LocalStubDprFinder(), + CheckpointPeriodMilli = 10, + RefreshPeriodMilli = 5 + }); + // TODO(Tianyu): Switch implementation to epoch after testing + builder.Services.AddSingleton(typeof(IVersionScheme), typeof(RwLatchVersionScheme)); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(new FasterKvReservationStartFile + { + file = "C:\\Users\\tianyu\\Documents\\FASTER\\cs\\research\\darq\\workloads\\workload-micro-faster.csv" + }); + builder.Services.AddSingleton(); + + builder.Services.AddSingleton(); + builder.Services.AddSingleton(sp => sp.GetService()); + builder.Services.AddSingleton>(); + + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + builder.Services.AddHostedService(); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } + + public static async Task LaunchNonDseReservationService() + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, 15721, + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + }); + + var checkpointManager = new DeviceLogCommitCheckpointManager( + new NullNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\service"), removeOutdated: false); + builder.Services.AddSingleton(new FasterKVSettings + { + IndexSize = 1 << 25, + LogDevice = new NullDevice(), + PageSize = 1 << 25, + SegmentSize = 1 << 30, + MemorySize = 1 << 31, + CheckpointManager = checkpointManager, + TryRecoverLatest = false, + }); + builder.Services.AddSingleton>(); + builder.Services.AddSingleton(new FasterKvReservationStartFile + { + file = "C:\\Users\\tianyu\\Documents\\FASTER\\cs\\research\\darq\\workloads\\workload-micro-faster.csv" + }); + builder.Services.AddSingleton(); + + builder.Services.AddSingleton(); + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } +} \ No newline at end of file diff --git a/cs/research/darq/SpFasterMicrobench/SpFasterMicrobench.csproj b/cs/research/darq/SpFasterMicrobench/SpFasterMicrobench.csproj new file mode 100644 index 000000000..796357fae --- /dev/null +++ b/cs/research/darq/SpFasterMicrobench/SpFasterMicrobench.csproj @@ -0,0 +1,23 @@ + + + + true + Exe + net7.0 + enable + enable + FasterMicrobench + + + + + + + + + + + + + + diff --git a/cs/research/darq/StateObjectMicrobench/Program.cs b/cs/research/darq/StateObjectMicrobench/Program.cs new file mode 100644 index 000000000..a4565ad9f --- /dev/null +++ b/cs/research/darq/StateObjectMicrobench/Program.cs @@ -0,0 +1,118 @@ +using System.Diagnostics; +using CommandLine; +using FASTER.core; +using FASTER.libdpr; +using Task = System.Threading.Tasks.Task; + +namespace microbench; + +public class Options +{ + [Option('p', "detach-probability", Required = false, Default = 0.0, + HelpText = "probably of detach-merge in the workload. 
The remaining operations will be actions")] + public double DetachProbability { get; set; } + + [Option('n', "num-threads", Required = false, Default = 5, + HelpText = "number of threads doing work")] + public int NumThreads { get; set; } + + [Option('i', "checkpoint-interval", Required = false, Default = 10, + HelpText = "number of threads doing work")] + public int CheckpointInterval { get; set; } + + [Option('o', "num-ops", Required = false, Default = 1000000, + HelpText = "number of operations each thread will execute")] + public int NumOps { get; set; } +} + +public class TestStateObject : StateObject +{ + public TestStateObject(IVersionScheme versionScheme, DprWorkerOptions options) : base(versionScheme, options) + { + } + + public override void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + onPersist(); + } + + public override void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + throw new NotImplementedException(); + } + + public override void PruneVersion(long version) + { + } + + public override IEnumerable> GetUnprunedVersions() + { + yield break; + } + + public override void Dispose() + { + } +} + +public class Program +{ + public static void Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + + var tested = new TestStateObject(new EpochProtectedVersionScheme(new LightEpoch()), new DprWorkerOptions + { + Me = new DprWorkerId(0), + DprFinder = new LocalStubDprFinder(), + CheckpointPeriodMilli = options.CheckpointInterval, + RefreshPeriodMilli = 5 + }); + var backgroundTask = new StateObjectRefreshBackgroundService(null, tested); + _ = Task.Run(() => backgroundTask.StartAsync(default)); + + var random = new Random(); + var threads = new List(); + for (var i = 0; i < options.NumThreads; i++) + { + var workload = new byte[options.NumOps]; + for (var j = 0; j 
< options.NumOps; j++) + workload[j] = (byte)(random.NextDouble() < options.DetachProbability ? 1 : 0); + threads.Add(new Thread(() => RunBenchmarkThread(tested, workload))); + } + + var stopwatch = Stopwatch.StartNew(); + tested.ConnectToCluster(out _); + foreach (var thread in threads) + thread.Start(); + foreach (var thread in threads) + thread.Join(); + Console.WriteLine(options.NumThreads * options.NumOps * 1000.0 / stopwatch.ElapsedMilliseconds); + backgroundTask.StopAsync(default); + } + + public static void RunBenchmarkThread(TestStateObject so, byte[] workload) + { + DprSession prevSession = null; + foreach (var op in workload) + { + if (prevSession != null) + so.TryMergeAndStartAction(prevSession); + else + so.StartLocalAction(); + switch (op) + { + case 0: + so.EndAction(); + break; + case 1: + prevSession = so.DetachFromWorkerAndPauseAction(); + break; + default: + throw new NotImplementedException(); + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/StateObjectMicrobench/StateObjectMicrobench.csproj b/cs/research/darq/StateObjectMicrobench/StateObjectMicrobench.csproj new file mode 100644 index 000000000..84de6f0ef --- /dev/null +++ b/cs/research/darq/StateObjectMicrobench/StateObjectMicrobench.csproj @@ -0,0 +1,20 @@ + + + + Exe + net7.0 + enable + enable + true + + + + + + + + + + + + diff --git a/cs/research/darq/TravelReservation/Environment.cs b/cs/research/darq/TravelReservation/Environment.cs new file mode 100644 index 000000000..36343e0ec --- /dev/null +++ b/cs/research/darq/TravelReservation/Environment.cs @@ -0,0 +1,272 @@ +using Azure.Storage.Blobs; +using FASTER.core; +using FASTER.devices; +using FASTER.libdpr; + +namespace TravelReservation; + +public interface IEnvironment +{ + public string GetOrchestratorConnString(); + + public int GetOrchestratorPort(Options options); + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options); + + public IDevice GetOrchestratorDevice(Options 
options); + + public string GetServiceConnString(int index); + + public int GetServicePort(Options options); + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options); + + public IDevice GetServiceDevice(Options options); + + public string GetDprFinderConnString(); + + public int GetDprFinderPort(); + + public PingPongDevice GetDprFinderDevice(); + + public Task PublishResultsAsync(string fileName, MemoryStream bytes); +} + +public class LocalDebugEnvironment : IEnvironment +{ + private int roundRobin; + public string GetOrchestratorConnString() + { + var port = roundRobin++ / 2 == 0 ? 15724 : 15725; + return $"http://127.0.0.1:{port}"; + } + + public int GetOrchestratorPort(Options options) + { + return options.WorkerName + 15721; + } + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\orchestrators{options.WorkerName}"), removeOutdated: false); + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) => + new ManagedLocalStorageDevice($"D:\\orchestator{options.WorkerName}.log", deleteOnClose: true); + + public string GetServiceConnString(int index) + { + return $"http://127.0.0.1:{15721 + index}"; + } + + public int GetServicePort(Options options) + { + return 15721 + options.WorkerName; + } + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"D:\\service{options.WorkerName}"), removeOutdated: false); + result.PurgeAll(); + return result; + } + + public IDevice GetServiceDevice(Options options) => + new ManagedLocalStorageDevice($"D:\\service{options.WorkerName}.log", deleteOnClose: true); + + public string GetDprFinderConnString() => "http://127.0.0.1:15720"; + + public int 
GetDprFinderPort() => 15720; + + public PingPongDevice GetDprFinderDevice() + { + var device1 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + var device2 = new LocalMemoryDevice(1 << 24, 1 << 24, 1); + return new PingPongDevice(device1, device2, true); + } + + public Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + Console.WriteLine($"Results for {fileName}:"); + var reader = new StreamReader(bytes); + var text = reader.ReadToEnd(); + // Print to console + Console.Write(text); + return Task.CompletedTask; + } +} + +public class KubernetesLocalStorageEnvironment : IEnvironment +{ + private bool cleanStart; + + public KubernetesLocalStorageEnvironment(bool cleanStart) + { + this.cleanStart = cleanStart; + } + + public string GetOrchestratorConnString() => "http://orchestrator.dse.svc.cluster.local:15721"; + + public int GetOrchestratorPort(Options options) => 15721; + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"/mnt/plrs/orchestrators{options.WorkerName}"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) + { + if (cleanStart) + ManagedLocalStorageDevice.RemoveIfPresent($"/mnt/plrs/orchestrator{options.WorkerName}.log"); + return new ManagedLocalStorageDevice($"/mnt/plrs/orchestrator{options.WorkerName}.log"); + } + + public string GetServiceConnString(int index) => $"http://service{index}.dse.svc.cluster.local:15721"; + + public int GetServicePort(Options options) => 15721; + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new LocalStorageNamedDeviceFactory(), + new DefaultCheckpointNamingScheme($"/mnt/plrs/service{options.WorkerName}"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + 
return result; + } + + public IDevice GetServiceDevice(Options options) + { + if (cleanStart) + ManagedLocalStorageDevice.RemoveIfPresent($"/mnt/plrs/service{options.WorkerName}.log"); + return new ManagedLocalStorageDevice($"/mnt/plrs/service{options.WorkerName}.log"); + } + + public string GetDprFinderConnString() => "http://dprfinder.dse.svc.cluster.local:15721"; + + public int GetDprFinderPort() => 15721; + + public PingPongDevice GetDprFinderDevice() + { + if (cleanStart) + { + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder1"); + ManagedLocalStorageDevice.RemoveIfPresent("/mnt/plrs/finder2"); + } + + var device1 = new ManagedLocalStorageDevice("/mnt/plrs/finder1", recoverDevice: true); + var device2 = new ManagedLocalStorageDevice("/mnt/plrs/finder2", recoverDevice: true); + return new PingPongDevice(device1, device2, true); + } + + public async Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + var connString = Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + await blobContainerClient.CreateIfNotExistsAsync(); + var blobClient = blobContainerClient.GetBlobClient(fileName); + + await blobClient.UploadAsync(bytes, overwrite: true); + } +} + +public class KubernetesCloudStorageEnvironment : IEnvironment +{ + private bool cleanStart; + + public KubernetesCloudStorageEnvironment(bool cleanStart) + { + this.cleanStart = cleanStart; + } + + public string GetOrchestratorConnString() => "http://orchestrator.dse.svc.cluster.local:15721"; + + public int GetOrchestratorPort(Options options) => 15721; + + public FileBasedCheckpointManager GetOrchestratorCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new AzureStorageNamedDeviceFactory(Environment.GetEnvironmentVariable("AZURE_CONN_STRING")), + new 
DefaultCheckpointNamingScheme($"orchestrators/{options.WorkerName}/checkpoints"), + removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetOrchestratorDevice(Options options) + { + var result = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "orchestrators", + options.WorkerName.ToString(), "darq"); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public string GetServiceConnString(int index) => $"http://service{index}.dse.svc.cluster.local:15721"; + + public int GetServicePort(Options options) => 15721; + + public FileBasedCheckpointManager GetServiceCheckpointManager(Options options) + { + var result = new FileBasedCheckpointManager( + new AzureStorageNamedDeviceFactory(Environment.GetEnvironmentVariable("AZURE_CONN_STRING")), + new DefaultCheckpointNamingScheme($"services/{options.WorkerName}/checkpoints"), removeOutdated: false); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public IDevice GetServiceDevice(Options options) + { + var result = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "services", + options.WorkerName.ToString(), "log"); + if (cleanStart) + result.PurgeAll(); + return result; + } + + public string GetDprFinderConnString() => "http://dprfinder.dse.svc.cluster.local:15721"; + + public int GetDprFinderPort() => 15721; + + public PingPongDevice GetDprFinderDevice() + { + var device1 = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "dprfinder", + "data", "1"); + var device2 = new AzureStorageDevice(Environment.GetEnvironmentVariable("AZURE_CONN_STRING"), "dprfinder", + "data", "2"); + if (cleanStart) + { + device1.PurgeAll(); + device2.PurgeAll(); + } + + return new PingPongDevice(device1, device2, true); + } + + public async Task PublishResultsAsync(string fileName, MemoryStream bytes) + { + var connString = 
Environment.GetEnvironmentVariable("AZURE_RESULTS_CONN_STRING"); + var blobServiceClient = new BlobServiceClient(connString); + var blobContainerClient = blobServiceClient.GetBlobContainerClient("results"); + + await blobContainerClient.CreateIfNotExistsAsync(); + var blobClient = blobContainerClient.GetBlobClient(fileName); + + await blobClient.UploadAsync(bytes, overwrite: true); + } +} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/Program.cs b/cs/research/darq/TravelReservation/Program.cs new file mode 100644 index 000000000..bbd86d7fa --- /dev/null +++ b/cs/research/darq/TravelReservation/Program.cs @@ -0,0 +1,307 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Net; +using System.Text; +using CommandLine; +using FASTER.client; +using FASTER.core; +using FASTER.darq; +using FASTER.libdpr; +using FASTER.libdpr.gRPC; +using Google.Protobuf; +using Grpc.Net.Client; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using dse.services; + +namespace TravelReservation; +public class Options +{ + [Option('t', "type", Required = true, + HelpText = "type of worker to launch")] + public string Type { get; set; } + + [Option('w', "workload-trace", Required = false, + HelpText = "Workload trace file to use")] + public string WorkloadTrace { get; set; } + + [Option('o', "output-file", Required = false, + HelpText = "Name of file to output")] + public string OutputFile { get; set; } + + [Option('n', "name", Required = false, + HelpText = "identifier of the service to launch")] + public int WorkerName { get; set; } + + [Option('s', "speculative", Required = false, Default = false, + HelpText = "whether services proceed speculatively")] + public bool Speculative { get; set; } + + [Option('i', "issue-window", Required = false, Default = 128, + HelpText = "how 
many requests can be concurrently in-flight")] + public int IssueWindow { get; set; } +} + +public class Program +{ + public static async Task Main(string[] args) + { + ParserResult result = Parser.Default.ParseArguments(args); + if (result.Tag == ParserResultType.NotParsed) return; + var options = result.MapResult(o => o, xs => new Options()); + // var environment = new LocalDebugEnvironment(); + var environment = new KubernetesLocalStorageEnvironment(true); + + switch (options.Type.Trim()) + { + case "client": + Console.WriteLine("Starting client"); + await LaunchBenchmarkClient(options, environment); + break; + case "orchestrator": + Console.WriteLine("Starting orchestrator"); + await LaunchOrchestratorService(options, environment); + break; + case "service": + Console.WriteLine("Starting reservation service"); + await LaunchReservationService(options, environment); + break; + case "dprfinder": + Console.WriteLine("Starting DPR finder"); + await LaunchDprFinder(options, environment); + break; + case "generate": + new WorkloadGenerator() + .SetNumClients(1) + .SetNumServices(3) + .SetNumWorkflowsPerSecond(100) + .SetNumSeconds(120) + .SetNumOfferings(10000) + .SetBaseFileName("C:\\Users\\tianyu\\Desktop\\workloads\\test") + .GenerateWorkloadTrace(new Random()); + break; + default: + throw new NotImplementedException(); + } + } + + private static async Task LaunchBenchmarkClient(Options options, IEnvironment environment) + { + Console.WriteLine("Parsing workload file..."); + var timedRequests = new List<(long, ExecuteWorkflowRequest)>(); + foreach (var line in File.ReadLines(options.WorkloadTrace)) + { + var args = line.Split(','); + var timestamp = long.Parse(args[0]); + + var request = new ExecuteWorkflowRequest + { + WorkflowId = long.Parse(args[1]), + WorkflowClassId = 0, + Input = ByteString.CopyFrom(line, Encoding.UTF8) + }; + timedRequests.Add(ValueTuple.Create(timestamp, request)); + } + + Console.WriteLine("Creating gRPC connections..."); + // Keep a few 
channels around and reuse them + var channelPool = new List(); + for (var i = 0; i < 8; i++) + // k8 load-balancing will ensure that we get a spread of different orchestrators behind these channels + channelPool.Add(GrpcChannel.ForAddress(environment.GetOrchestratorConnString())); + var measurements = new ConcurrentBag(); + var stopwatch = Stopwatch.StartNew(); + Console.WriteLine("Starting Workload..."); + var rateLimiter = new SemaphoreSlim(options.IssueWindow, options.IssueWindow); + for (var i = 0; i < timedRequests.Count; i++) + { + var request = timedRequests[i]; + while (stopwatch.ElapsedMilliseconds <= request.Item1) + Thread.Yield(); + var channel = channelPool[i % channelPool.Count]; + var client = new WorkflowOrchestrator.WorkflowOrchestratorClient(channel); + await rateLimiter.WaitAsync(); + _ = Task.Run(async () => + { + // Console.WriteLine($"Issuing request to start workflow id:{request.Item2.WorkflowId}, request content: {request.Item2.Input.ToString(Encoding.UTF8)}"); + await client.ExecuteWorkflowAsync(request.Item2); + var endTime = stopwatch.ElapsedMilliseconds; + // Console.WriteLine($"workflow id:{request.Item2.WorkflowId} has completed in {endTime - request.Item1} milliseconds"); + measurements.Add(endTime - request.Item1); + rateLimiter.Release(); + }); + } + + while (measurements.Count != timedRequests.Count) + await Task.Delay(5); + await WriteResults(options, environment, measurements); + } + + private static async Task WriteResults(Options options, IEnvironment environment,ConcurrentBag measurements) + { + using var memoryStream = new MemoryStream(); + await using var streamWriter = new StreamWriter(memoryStream); + foreach (var line in measurements) + streamWriter.WriteLine(line); + await streamWriter.FlushAsync(); + memoryStream.Position = 0; + await environment.PublishResultsAsync(options.OutputFile, memoryStream); + } + + public static async Task LaunchOrchestratorService(Options options, IEnvironment environment) + { + var builder = 
WebApplication.CreateBuilder(); + + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetOrchestratorPort(options), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + + var checkpointManager = environment.GetOrchestratorCheckpointManager(options); + builder.Services.AddSingleton(new DarqSettings + { + MyDpr = new DprWorkerId(options.WorkerName), + DprFinder = new GrpcDprFinder(GrpcChannel.ForAddress(environment.GetDprFinderConnString())), + LogDevice = environment.GetOrchestratorDevice(options), + LogCommitManager = checkpointManager, + PageSize = 1L << 22, + MemorySize = 1L << 30, + SegmentSize = 1L << 30, + CheckpointPeriodMilli = 10, + RefreshPeriodMilli = 5, + FastCommitMode = true, + }); + // TODO(Tianyu): Switch to epoch after testing + builder.Services.AddSingleton(typeof(IVersionScheme), typeof(RwLatchVersionScheme)); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(sp => sp.GetService()); + builder.Services.AddSingleton(new DarqMaintenanceBackgroundServiceSettings + { + morselSize = 512, + batchSize = 16, + // Workflow orchestrator DARQs never produce out messages + producerFactory = null, + speculative = true + }); + + var connectionPool = new ConcurrentDictionary(); + var workflowFactories = new Dictionary + { { 0, (input, logger) => new ReservationWorkflowStateMachine(input, connectionPool, environment, options.Speculative, logger) } }; + builder.Services.AddSingleton(new OrchestartorBackgroundProcessingServiceSettings + { + workflowFactories = workflowFactories, + speculative = options.Speculative + }); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton>(); + + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + 
builder.Services.AddHostedService(); + builder.Services.AddHostedService(); + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + foreach (var channel in connectionPool.Values) + channel.Dispose(); + } + + public static async Task LaunchDprFinder(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetDprFinderPort(), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + using var dprFinderServiceDevice = environment.GetDprFinderDevice(); + builder.Services.AddSingleton(dprFinderServiceDevice); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + + builder.Services.AddGrpc(); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + "Communication with gRPC endpoints must be made through a gRPC client. 
To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } + + public static async Task LaunchReservationService(Options options, IEnvironment environment) + { + var builder = WebApplication.CreateBuilder(); + builder.Logging.AddConsole(); + builder.Logging.SetMinimumLevel(LogLevel.Warning); + + builder.WebHost.ConfigureKestrel(serverOptions => + { + serverOptions.Listen(IPAddress.Any, environment.GetServicePort(options), + listenOptions => { listenOptions.Protocols = HttpProtocols.Http2; }); + serverOptions.Limits.MinRequestBodyDataRate = null; + }); + var checkpointManager = environment.GetServiceCheckpointManager(options); + builder.Services.AddSingleton(new FasterKVSettings + { + IndexSize = 1 << 25, + LogDevice = environment.GetServiceDevice(options), + PageSize = 1 << 25, + SegmentSize = 1 << 30, + MemorySize = 1 << 30, + CheckpointManager = checkpointManager, + TryRecoverLatest = false, + }); + builder.Services.AddSingleton>(); + builder.Services.AddSingleton(new DprWorkerOptions + { + Me = new DprWorkerId(options.WorkerName), + DprFinder = new GrpcDprFinder(GrpcChannel.ForAddress(environment.GetDprFinderConnString())), + CheckpointPeriodMilli = 10, + RefreshPeriodMilli = 5 + }); + // TODO(Tianyu): Switch implementation to epoch after testing + builder.Services.AddSingleton(typeof(IVersionScheme), typeof(RwLatchVersionScheme)); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(new FasterKvReservationStartFile + { + file = options.WorkloadTrace + }); + builder.Services.AddSingleton(); + + builder.Services.AddSingleton(); + builder.Services.AddSingleton(sp => sp.GetService()); + builder.Services.AddSingleton>(); + + builder.Services.AddGrpc(opt => { opt.Interceptors.Add>(); }); + builder.Services.AddHostedService(provider => + provider.GetRequiredService()); + builder.Services.AddHostedService(); + var app = builder.Build(); + + app.MapGrpcService(); + app.MapGet("/", + () => + 
"Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + await app.RunAsync(); + } +} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/ReservationWorkflow.cs b/cs/research/darq/TravelReservation/ReservationWorkflow.cs new file mode 100644 index 000000000..477ba1e75 --- /dev/null +++ b/cs/research/darq/TravelReservation/ReservationWorkflow.cs @@ -0,0 +1,215 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Text; +using dse.services; +using FASTER.common; +using FASTER.core; +using FASTER.libdpr; +using FASTER.libdpr.gRPC; +using Google.Protobuf; +using Grpc.Core.Interceptors; +using Grpc.Net.Client; +using Microsoft.Extensions.Logging; +using protobuf; +using DarqMessage = FASTER.libdpr.DarqMessage; +using DarqMessageType = FASTER.libdpr.DarqMessageType; + +namespace TravelReservation; + +internal enum ReservationWorkflowMessageTypes : byte +{ + RESERVATION_START, + RESERVATION_ROLLBACK +} + +internal struct ActivityDarqEntry : ILogEnqueueEntry +{ + internal long workflowId; + internal ReservationWorkflowMessageTypes type; + internal int index; + + public int SerializedLength => sizeof(long) + sizeof(ReservationWorkflowMessageTypes) + sizeof(int); + + public void SerializeTo(Span dest) + { + unsafe + { + fixed (byte* d = dest) + { + var head = d; + *(long*)head = workflowId; + head += sizeof(long); + *(ReservationWorkflowMessageTypes*)head = type; + head += sizeof(ReservationWorkflowMessageTypes); + *(int*)head = index; + } + } + } +} + +public class ReservationWorkflowStateMachine : IWorkflowStateMachine +{ + private long workflowId; + private List toExecute = new(); + private TaskCompletionSource tcs = new(); + private IDarqProcessorClientCapabilities capabilities; + private SimpleObjectPool stepRequestPool = new(() => new StepRequest()); + private ConcurrentDictionary connectionPool; + private 
IEnvironment environment; + private bool speculative; + private ILogger logger; + + public ReservationWorkflowStateMachine(ReadOnlySpan input, + ConcurrentDictionary connectionPool, IEnvironment environment, bool speculative, ILogger logger) + { + var messageString = Encoding.UTF8.GetString(input); + var split = messageString.Split(','); + workflowId = long.Parse(split[1]); + for (var i = 2; i < split.Length; i += 4) + { + toExecute.Add(new ReservationRequest + { + ReservationId = long.Parse(split[i]), + OfferingId = long.Parse(split[i + 1]), + CustomerId = long.Parse(split[i + 2]), + Count = int.Parse(split[i + 3]) + }); + } + + this.connectionPool = connectionPool; + this.environment = environment; + this.speculative = speculative; + this.logger = logger; + } + + public async Task GetResult(CancellationToken token) + { + var result = await tcs.Task.WaitAsync(token); + return new ExecuteWorkflowResult + { + Ok = result, + Result = ByteString.Empty, + }; + } + + public void ProcessMessage(DarqMessage m) + { + if (m.GetMessageBody().Length == sizeof(long)) + { + // Then this is the initial message, bootstrap the state machine and begin execution + var stepRequest = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(stepRequest); + + requestBuilder.AddSelfMessage(new ActivityDarqEntry + { + workflowId = workflowId, + type = ReservationWorkflowMessageTypes.RESERVATION_START, + index = 0 + }); + requestBuilder.MarkMessageConsumed(m.GetLsn()); + m.Dispose(); + + // Will always be completed synchronously + Task.Run(async () => + { + await capabilities.Step(requestBuilder.FinishStep()); + stepRequestPool.Return(stepRequest); + }); + return; + } + + Debug.Assert(m.GetMessageType() == DarqMessageType.IN); + var lsn = m.GetLsn(); + var type = (ReservationWorkflowMessageTypes) m.GetMessageBody()[sizeof(long)]; + var index = BitConverter.ToInt32( + m.GetMessageBody()[(sizeof(long) + sizeof(ReservationWorkflowMessageTypes))..]); + if (type == 
ReservationWorkflowMessageTypes.RESERVATION_START) + MakeReservation(lsn, index); + else + CancelReservation(lsn, index); + m.Dispose(); + } + + private void MakeReservation(long lsn, int index) + { + if (index == toExecute.Count) + { + // logger.LogInformation($"Workflow with id {workflowId} completed successfully"); + // We are done and there are no more reservations to make + tcs.SetResult(true); + return; + } + + var c = capabilities; + Task.Run(async () => + { + var channel = connectionPool.GetOrAdd(index, + i => GrpcChannel.ForAddress(environment.GetServiceConnString(i))); + var client = speculative + ? new FasterKVReservationService.FasterKVReservationServiceClient( + channel.Intercept(new DprClientInterceptor(c.GetDprSession()))) + : new FasterKVReservationService.FasterKVReservationServiceClient(channel); + + // logger.LogInformation($"Workflow with id {workflowId} is starting reservation number {index}"); + var result = await client.MakeReservationAsync(toExecute[index]); + // logger.LogInformation($"Workflow with id {workflowId} has completed reservation number {index}"); + var stepRequest = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(stepRequest); + requestBuilder.MarkMessageConsumed(lsn); + requestBuilder.AddSelfMessage(new ActivityDarqEntry + { + workflowId = workflowId, + type = result.Ok + ? ReservationWorkflowMessageTypes.RESERVATION_START + : ReservationWorkflowMessageTypes.RESERVATION_ROLLBACK, + index = result.Ok ? 
index + 1 : index - 1 + }); + // Will always be completed synchronously + await c.Step(requestBuilder.FinishStep()); + stepRequestPool.Return(stepRequest); + }); + } + + private void CancelReservation(long lsn, int index) + { + if (index == -1) + { + // logger.LogInformation($"Workflow with id {workflowId} completed with rollback"); + // We are done and there are no more reservations to make + tcs.SetResult(false); + return; + } + var c = capabilities; + Task.Run(async () => + { + var channel = connectionPool.GetOrAdd(index, + k => GrpcChannel.ForAddress(environment.GetServiceConnString(index))); + var client = speculative + ? new FasterKVReservationService.FasterKVReservationServiceClient( + channel.Intercept(new DprClientInterceptor(c.GetDprSession()))) + : new FasterKVReservationService.FasterKVReservationServiceClient(channel); + + // logger.LogInformation($"Workflow with id {workflowId} is cancelling reservation number {index}"); + await client.CancelReservationAsync(toExecute[index]); + // logger.LogInformation($"Workflow with id {workflowId} has cancelled reservation number {index}"); + var stepRequest = stepRequestPool.Checkout(); + var requestBuilder = new StepRequestBuilder(stepRequest); + requestBuilder.MarkMessageConsumed(lsn); + requestBuilder.AddSelfMessage(new ActivityDarqEntry + { + workflowId = workflowId, + type = ReservationWorkflowMessageTypes.RESERVATION_ROLLBACK, + index = index - 1 + }); + // Will always be completed synchronously + await c.Step(requestBuilder.FinishStep()); + stepRequestPool.Return(stepRequest); + }); + } + + public void OnRestart(IDarqProcessorClientCapabilities capabilities, StateObject backend) + { + this.capabilities = capabilities; + // TODO(Tianyu): currently not actually a restart -- will only be called once at start and can only handle that + } +} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/TravelReservation.csproj b/cs/research/darq/TravelReservation/TravelReservation.csproj new file 
mode 100644 index 000000000..3b80de3bb --- /dev/null +++ b/cs/research/darq/TravelReservation/TravelReservation.csproj @@ -0,0 +1,60 @@ + + + + Exe + true + net7.0 + enable + enable + SimpleWorkflowBench + true + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + templates + + + + diff --git a/cs/research/darq/TravelReservation/WorkloadGenerator.cs b/cs/research/darq/TravelReservation/WorkloadGenerator.cs new file mode 100644 index 000000000..9409f5c4d --- /dev/null +++ b/cs/research/darq/TravelReservation/WorkloadGenerator.cs @@ -0,0 +1,129 @@ +using System.Text; +using MathNet.Numerics.Distributions; + +namespace TravelReservation; + +public class WorkloadGenerator +{ + private int numClients, numServices, numWorkflowsPerSecond, numSeconds, numOfferings; + private string baseFileName; + + public WorkloadGenerator SetNumClients(int numClients) + { + this.numClients = numClients; + return this; + } + + public WorkloadGenerator SetNumServices(int numServices) + { + this.numServices = numServices; + return this; + } + + public WorkloadGenerator SetNumWorkflowsPerSecond(int numWorkflowsPerSecond) + { + this.numWorkflowsPerSecond = numWorkflowsPerSecond; + return this; + } + + public WorkloadGenerator SetNumSeconds(int numSeconds) + { + this.numSeconds = numSeconds; + return this; + } + + public WorkloadGenerator SetNumOfferings(int numOfferings) + { + this.numOfferings = numOfferings; + return this; + } + + public WorkloadGenerator SetBaseFileName(string baseFileName) + { + this.baseFileName = baseFileName; + return this; + } + + public void GenerateWorkloadTrace(Random random) + { + // Generate database + // Over provision a to ensure we don't all abort + var numOfferingsRequired = (int) Math.Ceiling(numClients * numWorkflowsPerSecond * numSeconds * 4.0 / numOfferings); + for (var i = 0; i < 
numServices; i++) + { + using var writer = new StreamWriter($"{baseFileName}-service-{i}.csv"); + for (var j = 0; j < numOfferings; j++) + { + var builder = new StringBuilder(); + // offering id + builder.Append(j); + builder.Append(','); + // entityId id + builder.Append(random.NextInt64()); + builder.Append(','); + // price + builder.Append(random.Next(100, 300)); + builder.Append(','); + // num reservable + // TODO(Tianyu): Randomize a bit? + builder.Append(numOfferingsRequired); + writer.WriteLine(builder.ToString()); + } + } + + // Generate workload + var poisson = new Poisson(numWorkflowsPerSecond); + for (var i = 0; i < numClients; i++) + { + List requestTimestamps = new(); + for (var second = 0; second < numSeconds; second++) + { + var numRequests = poisson.Sample(); + for (var request = 0; request < numRequests; request++) + requestTimestamps.Add(1000 * second + random.Next(1000)); + } + + requestTimestamps.Sort(); + var uniqueIds = new Dictionary(); + using var writer = new StreamWriter($"{baseFileName}-client-{i}.csv"); + foreach (var timestamp in requestTimestamps) + { + var builder = new StringBuilder(); + // Issue time + builder.Append(timestamp); + builder.Append(','); + // Workflow Id -- must ensure uniqueness + long id; + do + { + id = random.NextInt64(0xFFFFFFFFFFFF) / numClients + i; + } while (!uniqueIds.TryAdd(id, 0)); + builder.Append(id); + + for (var j = 0; j < numServices; j++) + { + builder.Append(','); + // Reservation Id -- must ensure uniqueness + do + { + id = random.NextInt64(0xFFFFFFFFFFFF) / numClients + i; + } while (!uniqueIds.TryAdd(id, 0)); + + builder.Append(id); + builder.Append(','); + // offeringId + builder.Append(random.NextInt64(numOfferings)); + builder.Append(','); + // customerId + builder.Append(random.NextInt64()); + builder.Append(','); + // count + // TODO(Tianyu): More than 1? 
+ builder.Append(1); + } + + writer.WriteLine(builder.ToString()); + } + } + } +} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/Chart.yaml b/cs/research/darq/TravelReservation/helm-storage/Chart.yaml new file mode 100644 index 000000000..f25da764f --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-storage/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: reservation-storage +description: Helm chart for TravelReservation workload (storage resources) +type: application +version: 0.1.0 +appVersion: "1.16.0" \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/templates/dprfinder-pvc.yaml b/cs/research/darq/TravelReservation/helm-storage/templates/dprfinder-pvc.yaml new file mode 100644 index 000000000..d91dc5cd9 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-storage/templates/dprfinder-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: dprfinder-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: premium-lrs + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/templates/reservation-pvc.yaml b/cs/research/darq/TravelReservation/helm-storage/templates/reservation-pvc.yaml new file mode 100644 index 000000000..d77928f43 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-storage/templates/reservation-pvc.yaml @@ -0,0 +1,14 @@ +{{- range .Values.services }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: service{{ .num }}-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: premium-lrs + resources: + requests: + storage: 10Gi +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/templates/storageclass.yaml b/cs/research/darq/TravelReservation/helm-storage/templates/storageclass.yaml new file mode 100644 index 000000000..9e19c8900 --- /dev/null +++ 
b/cs/research/darq/TravelReservation/helm-storage/templates/storageclass.yaml @@ -0,0 +1,9 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: premium-lrs +provisioner: kubernetes.io/azure-disk +parameters: + storageaccounttype: Premium_LRS +reclaimPolicy: Delete +volumeBindingMode: Immediate \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/templates/workflow-pvc.yaml b/cs/research/darq/TravelReservation/helm-storage/templates/workflow-pvc.yaml new file mode 100644 index 000000000..8c1603721 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-storage/templates/workflow-pvc.yaml @@ -0,0 +1,14 @@ +{{- range .Values.orchestrators }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: orchestrator{{ .num }}-pvc +spec: + accessModes: + - ReadWriteOnce + storageClassName: premium-lrs + resources: + requests: + storage: 5Gi +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-storage/values.yaml b/cs/research/darq/TravelReservation/helm-storage/values.yaml new file mode 100644 index 000000000..a10ef4902 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-storage/values.yaml @@ -0,0 +1,18 @@ +conn_string: foo +results_conn_string: foo +experiment: TravelReservation-latency +workload: workload-tiny + +services: + - num: 0 + port: 15721 + +orchestrator_port: 15721 + +orchestrators: + - num: 1 + +clients: + - num: 0 + +speculative: false \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/Chart.yaml b/cs/research/darq/TravelReservation/helm-workload/Chart.yaml new file mode 100644 index 000000000..279950e87 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: reservation-workload +description: Helm chart for TravelReservation workload (clients, services, and orchestrator) +type: application +version: 0.1.0 +appVersion: "1.16.0" \ No newline 
at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/client-jobs.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/client-jobs.yaml new file mode 100644 index 000000000..d3ce1fe3d --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/client-jobs.yaml @@ -0,0 +1,51 @@ +{{- range .Values.clients }} +apiVersion: batch/v1 +kind: Job +metadata: + name: client{{ .num }} +spec: + template: + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-all + image: bitnami/kubectl + command: [ "sh", "-c" ] + args: + - > + while true; do + # Check if all deployments have at least one ready replica + notReadyCount=$(kubectl get deployments -n dse | grep -c "0/1") + if [ "$notReadyCount" -eq 0 ]; then + echo "All deployments are ready."; + break; + else + echo "Waiting for all deployments to be ready. $notReadyCount deployments are not ready."; + sleep 1; + fi; + done; + containers: + - command: + - "TravelReservation/TravelReservation" + args: + - "-t" + - "client" + - "-w" + - "{{ $.Values.experiment }}/workloads/{{ $.Values.workload }}-client-{{ .num }}.csv" + - "-n" + - "{{ .num }}" + - "-o" + - "{{ $.Values.workload }}-result-{{ .num }}-{{- if $.Values.speculative }}speculative{{- end }}.txt" + - "-i" + - "{{ $.Values.window }}" + image: tianyuli96/faster:latest + name: client{{ .num }} + ports: + - containerPort: 15721 + envFrom: + - configMapRef: + name: env-config + restartPolicy: Never +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-deployment.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-deployment.yaml new file mode 100644 index 000000000..3a897dd4b --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + 
service: dprfinder + name: dprfinder +spec: + replicas: 1 + selector: + matchLabels: + service: dprfinder + strategy: {} + template: + metadata: + labels: + service: dprfinder + spec: + priorityClassName: high-priority + containers: + - command: + - "TravelReservation/TravelReservation" + args: + - "-t" + - "dprfinder" + image: tianyuli96/faster:latest + name: dprfinder + ports: + - containerPort: 15721 + - containerPort: 4022 + volumeMounts: + - name: dprfinderstorage + mountPath: "/mnt/plrs" + envFrom: + - configMapRef: + name: env-config + volumes: + - name: dprfinderstorage + persistentVolumeClaim: + claimName: dprfinder-pvc + nodeSelector: + nodepool: dsebench + restartPolicy: Always \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-service.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-service.yaml new file mode 100644 index 000000000..5268bde33 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/dprfinder-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + service: dprfinder + name: dprfinder +spec: + ports: + - name: "15721" + port: 15721 + targetPort: 15721 + selector: + service: dprfinder diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/env-config.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/env-config.yaml new file mode 100644 index 000000000..a876ec967 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/env-config.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: env-config + namespace: dse +data: + AZURE_CONN_STRING: "{{ .Values.conn_string }}" + AZURE_RESULTS_CONN_STRING: "{{ .Values.results_conn_string }}" \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/reservation-deployments.yaml 
b/cs/research/darq/TravelReservation/helm-workload/templates/reservation-deployments.yaml new file mode 100644 index 000000000..e6fff53a8 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/reservation-deployments.yaml @@ -0,0 +1,57 @@ +{{- range .Values.services }} +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + service: service{{ .num }} + name: service{{ .num }} +spec: + replicas: 1 + selector: + matchLabels: + service: service{{ .num }} + strategy: {} + template: + metadata: + labels: + service: service{{ .num }} + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + - name: wait-for-dprfinder{{ .num }} + image: bitnami/kubectl + command: [ 'sh', '-c', 'until kubectl get deployment -l service=dprfinder -n dse | grep -q "1/1"; do echo waiting for deployment-b; sleep 1; done;' ] + containers: + - command: + - "TravelReservation/TravelReservation" + args: + - "-t" + - "service" + - "-w" + - "{{ $.Values.experiment }}/workloads/{{ $.Values.workload }}-service-{{ .num }}.csv" + - "-n" + - "{{ .num }}" + {{- if $.Values.services }} + - "-s" + {{- end }} + image: tianyuli96/faster:latest + name: service{{ .num }} + ports: + - containerPort: 15721 + - containerPort: 4022 + volumeMounts: + - name: servicestorage{{ .num }} + mountPath: "/mnt/plrs" + envFrom: + - configMapRef: + name: env-config + volumes: + - name: servicestorage{{ .num }} + persistentVolumeClaim: + claimName: service{{ .num }}-pvc + nodeSelector: + nodepool: dsebench + restartPolicy: Always +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/reservation-services.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/reservation-services.yaml new file mode 100644 index 000000000..6d5884d43 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/reservation-services.yaml @@ -0,0 +1,16 @@ +{{- range .Values.services }} 
+apiVersion: v1 +kind: Service +metadata: + labels: + service: "service{{ .num }}" + name: "service{{ .num }}" +spec: + ports: + - name: "15721" + port: 15721 + targetPort: {{ .port }} + selector: + service: "service{{ .num }}" +--- +{{- end }} \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/service-role-auth.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/service-role-auth.yaml new file mode 100644 index 000000000..ff978ccc0 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/service-role-auth.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: deployment-watcher + namespace: dse +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: deployment-reader + namespace: dse +rules: + - apiGroups: [ "apps" ] + resources: [ "deployments" ] + verbs: [ "get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: read-deployments + namespace: dse +subjects: + - kind: ServiceAccount + name: deployment-watcher + namespace: dse +roleRef: + kind: Role + name: deployment-reader + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/workflow-deployment.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/workflow-deployment.yaml new file mode 100644 index 000000000..977bf6d28 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/workflow-deployment.yaml @@ -0,0 +1,55 @@ +{{- range .Values.orchestrators }} +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + service: "orchestrator" + name: "orchestrator{{ .num }}" +spec: + replicas: 1 + selector: + matchLabels: + service: "orchestrator" + strategy: {} + template: + metadata: + labels: + service: "orchestrator" + spec: + priorityClassName: high-priority + serviceAccountName: deployment-watcher + initContainers: + 
- name: wait-for-dprfinder{{ .num }} + image: bitnami/kubectl + command: [ 'sh', '-c', 'until kubectl get deployment -l service=dprfinder -n dse | grep -q "1/1"; do echo waiting for deployment-b; sleep 1; done;' ] + containers: + - command: + - "TravelReservation/TravelReservation" + args: + - "-t" + - "orchestrator" + - "-n" + - "{{ .num }}" + {{- if $.Values.speculative }} + - "-s" + {{- end }} + image: tianyuli96/faster:latest + name: "orchestrator{{ .num }}" + ports: + - containerPort: {{ $.Values.orchestrator_port }} + - containerPort: 4022 + volumeMounts: + - name: orchestratorstorage{{ .num }} + mountPath: "/mnt/plrs" + envFrom: + - configMapRef: + name: env-config + volumes: + - name: orchestratorstorage{{ .num }} + persistentVolumeClaim: + claimName: orchestrator{{ .num }}-pvc + nodeSelector: + nodepool: dsebench + restartPolicy: Always +--- +{{- end }} diff --git a/cs/research/darq/TravelReservation/helm-workload/templates/workflow-service.yaml b/cs/research/darq/TravelReservation/helm-workload/templates/workflow-service.yaml new file mode 100644 index 000000000..96bfcf827 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/templates/workflow-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + service: "orchestrator" + name: "orchestrator" +spec: + ports: + - name: "15721" + port: 15721 + targetPort: {{ .Values.orchestrator_port }} + selector: + service: "orchestrator" diff --git a/cs/research/darq/TravelReservation/helm-workload/values.yaml b/cs/research/darq/TravelReservation/helm-workload/values.yaml new file mode 100644 index 000000000..83e4da541 --- /dev/null +++ b/cs/research/darq/TravelReservation/helm-workload/values.yaml @@ -0,0 +1,19 @@ +conn_string: foo +results_conn_string: foo +experiment: TravelReservation-latency +workload: workload-tiny +window: 32 + +services: + - num: 0 + port: 15721 + +orchestrator_port: 15721 + +orchestrators: + - num: 1 + +clients: + - num: 0 + +speculative: false \ No 
newline at end of file diff --git a/cs/research/darq/darq.sln b/cs/research/darq/darq.sln new file mode 100644 index 000000000..a66ccb0d6 --- /dev/null +++ b/cs/research/darq/darq.sln @@ -0,0 +1,114 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.darq", "FASTER.darq\FASTER.darq.csproj", "{B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.libdpr", "..\libdpr\src\FASTER.libdpr\FASTER.libdpr.csproj", "{6595C1EF-2710-4E64-AD31-9B784F186468}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.common", "..\..\remote\src\FASTER.common\FASTER.common.csproj", "{435381D2-9117-4E71-98D4-8DFD9971AB11}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.core", "..\..\src\core\FASTER.core.csproj", "{EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.client", "..\..\remote\src\FASTER.client\FASTER.client.csproj", "{27CF2EE1-3FB1-4C0A-9354-F58EB2961157}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.server", "..\..\remote\src\FASTER.server\FASTER.server.csproj", "{02C46B10-DF7E-469B-A8CD-B7AE7F504A24}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "playground", "playground", "{4996CBEF-5873-485D-BF02-63240C9033C0}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{89B55DF7-9C0E-4686-BCB7-BBAE2707A17A}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TravelReservation", "TravelReservation\TravelReservation.csproj", "{12B67692-FE42-4D25-A721-58887E3D753E}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ExampleServices", "ExampleServices\ExampleServices.csproj", "{F3E0A24A-348F-460F-875E-5A27C9FEB24E}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.devices.AzureStorageDevice", 
"..\..\src\devices\AzureStorageDevice\FASTER.devices.AzureStorageDevice.csproj", "{CE36DEFA-ACD7-448C-B658-B386CD257E59}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EventProcessing", "EventProcessing\EventProcessing.csproj", "{87F655AD-4E00-4FBD-91B6-A0421BA24E4A}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StateObjectMicrobench", "StateObjectMicrobench\StateObjectMicrobench.csproj", "{6005BD11-340A-4757-9F28-0CC1BA0C95D9}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpFasterMicrobench", "SpFasterMicrobench\SpFasterMicrobench.csproj", "{F9FAE014-CE56-4A15-B334-CB492FF818DF}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CoordinatorMicrobench", "CoordinatorMicrobench\CoordinatorMicrobench.csproj", "{9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DistributedTransactions", "DistributedTransactions\DistributedTransactions.csproj", "{20E326F4-3D33-46A1-8932-A337F59365E3}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB}.Release|Any CPU.Build.0 = Release|Any CPU + {6595C1EF-2710-4E64-AD31-9B784F186468}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {6595C1EF-2710-4E64-AD31-9B784F186468}.Debug|Any CPU.Build.0 = Debug|Any CPU + {6595C1EF-2710-4E64-AD31-9B784F186468}.Release|Any CPU.ActiveCfg = Release|Any CPU + {6595C1EF-2710-4E64-AD31-9B784F186468}.Release|Any CPU.Build.0 = Release|Any CPU + {435381D2-9117-4E71-98D4-8DFD9971AB11}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + 
{435381D2-9117-4E71-98D4-8DFD9971AB11}.Debug|Any CPU.Build.0 = Debug|Any CPU + {435381D2-9117-4E71-98D4-8DFD9971AB11}.Release|Any CPU.ActiveCfg = Release|Any CPU + {435381D2-9117-4E71-98D4-8DFD9971AB11}.Release|Any CPU.Build.0 = Release|Any CPU + {EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0}.Release|Any CPU.ActiveCfg = Release|Any CPU + {EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0}.Release|Any CPU.Build.0 = Release|Any CPU + {27CF2EE1-3FB1-4C0A-9354-F58EB2961157}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {27CF2EE1-3FB1-4C0A-9354-F58EB2961157}.Debug|Any CPU.Build.0 = Debug|Any CPU + {27CF2EE1-3FB1-4C0A-9354-F58EB2961157}.Release|Any CPU.ActiveCfg = Release|Any CPU + {27CF2EE1-3FB1-4C0A-9354-F58EB2961157}.Release|Any CPU.Build.0 = Release|Any CPU + {02C46B10-DF7E-469B-A8CD-B7AE7F504A24}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {02C46B10-DF7E-469B-A8CD-B7AE7F504A24}.Debug|Any CPU.Build.0 = Debug|Any CPU + {02C46B10-DF7E-469B-A8CD-B7AE7F504A24}.Release|Any CPU.ActiveCfg = Release|Any CPU + {02C46B10-DF7E-469B-A8CD-B7AE7F504A24}.Release|Any CPU.Build.0 = Release|Any CPU + {12B67692-FE42-4D25-A721-58887E3D753E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {12B67692-FE42-4D25-A721-58887E3D753E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {12B67692-FE42-4D25-A721-58887E3D753E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {12B67692-FE42-4D25-A721-58887E3D753E}.Release|Any CPU.Build.0 = Release|Any CPU + {F3E0A24A-348F-460F-875E-5A27C9FEB24E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F3E0A24A-348F-460F-875E-5A27C9FEB24E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F3E0A24A-348F-460F-875E-5A27C9FEB24E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F3E0A24A-348F-460F-875E-5A27C9FEB24E}.Release|Any CPU.Build.0 = Release|Any CPU + {CE36DEFA-ACD7-448C-B658-B386CD257E59}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + 
{CE36DEFA-ACD7-448C-B658-B386CD257E59}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CE36DEFA-ACD7-448C-B658-B386CD257E59}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CE36DEFA-ACD7-448C-B658-B386CD257E59}.Release|Any CPU.Build.0 = Release|Any CPU + {87F655AD-4E00-4FBD-91B6-A0421BA24E4A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {87F655AD-4E00-4FBD-91B6-A0421BA24E4A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {87F655AD-4E00-4FBD-91B6-A0421BA24E4A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {87F655AD-4E00-4FBD-91B6-A0421BA24E4A}.Release|Any CPU.Build.0 = Release|Any CPU + {6005BD11-340A-4757-9F28-0CC1BA0C95D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {6005BD11-340A-4757-9F28-0CC1BA0C95D9}.Debug|Any CPU.Build.0 = Debug|Any CPU + {6005BD11-340A-4757-9F28-0CC1BA0C95D9}.Release|Any CPU.ActiveCfg = Release|Any CPU + {6005BD11-340A-4757-9F28-0CC1BA0C95D9}.Release|Any CPU.Build.0 = Release|Any CPU + {F9FAE014-CE56-4A15-B334-CB492FF818DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F9FAE014-CE56-4A15-B334-CB492FF818DF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F9FAE014-CE56-4A15-B334-CB492FF818DF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F9FAE014-CE56-4A15-B334-CB492FF818DF}.Release|Any CPU.Build.0 = Release|Any CPU + {9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6}.Release|Any CPU.Build.0 = Release|Any CPU + {20E326F4-3D33-46A1-8932-A337F59365E3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {20E326F4-3D33-46A1-8932-A337F59365E3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {20E326F4-3D33-46A1-8932-A337F59365E3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {20E326F4-3D33-46A1-8932-A337F59365E3}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {27CF2EE1-3FB1-4C0A-9354-F58EB2961157} = 
{89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {435381D2-9117-4E71-98D4-8DFD9971AB11} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {EC0B4D5B-8DB6-45BA-8FD8-F1BF0A18CEA0} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {B6949D0A-D49C-4B88-BEEC-FFEA1CD018CB} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {6595C1EF-2710-4E64-AD31-9B784F186468} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {02C46B10-DF7E-469B-A8CD-B7AE7F504A24} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {12B67692-FE42-4D25-A721-58887E3D753E} = {4996CBEF-5873-485D-BF02-63240C9033C0} + {F3E0A24A-348F-460F-875E-5A27C9FEB24E} = {4996CBEF-5873-485D-BF02-63240C9033C0} + {CE36DEFA-ACD7-448C-B658-B386CD257E59} = {89B55DF7-9C0E-4686-BCB7-BBAE2707A17A} + {87F655AD-4E00-4FBD-91B6-A0421BA24E4A} = {4996CBEF-5873-485D-BF02-63240C9033C0} + {6005BD11-340A-4757-9F28-0CC1BA0C95D9} = {4996CBEF-5873-485D-BF02-63240C9033C0} + {F9FAE014-CE56-4A15-B334-CB492FF818DF} = {4996CBEF-5873-485D-BF02-63240C9033C0} + {9EA93D69-9C33-4CDD-A7DE-E9C766ADF9E6} = {4996CBEF-5873-485D-BF02-63240C9033C0} + EndGlobalSection +EndGlobal diff --git a/cs/research/libdpr/libdpr.sln b/cs/research/libdpr/libdpr.sln new file mode 100644 index 000000000..2d352477b --- /dev/null +++ b/cs/research/libdpr/libdpr.sln @@ -0,0 +1,53 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{3194EF49-D59E-438E-BD68-04FA6CC570E7}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.libdpr", "src\FASTER.libdpr\FASTER.libdpr.csproj", "{3CE274C4-1764-4A26-BEED-234B2698B614}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{0E3DF871-4507-4F77-9AEC-DB3924388974}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.libdpr.test", "test\FASTER.libdpr.test\FASTER.libdpr.test.csproj", "{CE867101-9A33-446D-A1E0-D343C05B31F5}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.core", "..\..\src\core\FASTER.core.csproj", 
"{868D2902-921D-462D-884F-D3500EFDFB0E}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FASTER.common", "..\..\remote\src\FASTER.common\FASTER.common.csproj", "{2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HotelReservationBench", "src\HotelReservationBench\HotelReservationBench.csproj", "{002E90F7-8EBF-4B3E-8E6E-578BD1A06C66}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "playground", "playground", "{10EAF317-499E-439C-944B-A69B9A7B9501}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3CE274C4-1764-4A26-BEED-234B2698B614}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3CE274C4-1764-4A26-BEED-234B2698B614}.Release|Any CPU.Build.0 = Release|Any CPU + {3CE274C4-1764-4A26-BEED-234B2698B614}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3CE274C4-1764-4A26-BEED-234B2698B614}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CE867101-9A33-446D-A1E0-D343C05B31F5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CE867101-9A33-446D-A1E0-D343C05B31F5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CE867101-9A33-446D-A1E0-D343C05B31F5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CE867101-9A33-446D-A1E0-D343C05B31F5}.Release|Any CPU.Build.0 = Release|Any CPU + {868D2902-921D-462D-884F-D3500EFDFB0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {868D2902-921D-462D-884F-D3500EFDFB0E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {868D2902-921D-462D-884F-D3500EFDFB0E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {868D2902-921D-462D-884F-D3500EFDFB0E}.Release|Any CPU.Build.0 = Release|Any CPU + {2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973}.Release|Any CPU.ActiveCfg = Release|Any 
CPU + {2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973}.Release|Any CPU.Build.0 = Release|Any CPU + {002E90F7-8EBF-4B3E-8E6E-578BD1A06C66}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {002E90F7-8EBF-4B3E-8E6E-578BD1A06C66}.Debug|Any CPU.Build.0 = Debug|Any CPU + {002E90F7-8EBF-4B3E-8E6E-578BD1A06C66}.Release|Any CPU.ActiveCfg = Release|Any CPU + {002E90F7-8EBF-4B3E-8E6E-578BD1A06C66}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {3CE274C4-1764-4A26-BEED-234B2698B614} = {3194EF49-D59E-438E-BD68-04FA6CC570E7} + {CE867101-9A33-446D-A1E0-D343C05B31F5} = {0E3DF871-4507-4F77-9AEC-DB3924388974} + {868D2902-921D-462D-884F-D3500EFDFB0E} = {3194EF49-D59E-438E-BD68-04FA6CC570E7} + {2470FBCF-3CC5-41B1-9BEB-CCF4AB8C2973} = {3194EF49-D59E-438E-BD68-04FA6CC570E7} + {002E90F7-8EBF-4B3E-8E6E-578BD1A06C66} = {10EAF317-499E-439C-944B-A69B9A7B9501} + EndGlobalSection +EndGlobal diff --git a/cs/research/libdpr/samples/DprCounters/DprCounters/CounterClient.cs b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterClient.cs new file mode 100644 index 000000000..81eb78812 --- /dev/null +++ b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterClient.cs @@ -0,0 +1,103 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Net; +using System.Net.Sockets; +using System.Threading.Tasks; +using FASTER.libdpr; + +namespace DprCounters +{ + /// + /// Client to a cluster of CounterServers. DPR-capable. 
+ /// + public class CounterClient + { + private DprSession session; + private Dictionary cluster; + private byte[] serializationBuffer = new byte[1 << 15]; + private long serialNum = 0; + private ClientVersionTracker versionTracker = new(); + private IDprFinder dprFinder; + + /// + /// Create a new client session + /// + /// dpr session + /// static cluster mapping + public CounterClient(IDprFinder dprFinder, Dictionary cluster) + { + this.dprFinder = dprFinder; + this.session = new DprSession(); + this.cluster = cluster; + } + + /// + /// Increments the counter at the given location by the given amount + /// + /// counter location + /// amount to increment counter by + /// result + /// unique id for operation + public long Increment(WorkerId worker, long amount, out long result) + { + var id = serialNum++; + // Add unique id to tracking + versionTracker.Add(id); + // Before sending operations, consult with DPR client for a batch header. For this simple example, we + // are using one message per batch + var headerSize = session.ComputeHeaderForSend(new Span(serializationBuffer, sizeof(int), + serializationBuffer.Length - sizeof(int))); + // Use a serialization scheme that writes a size field and then the DPR header and request in sequence. + BitConverter.TryWriteBytes(new Span(serializationBuffer, 0, sizeof(int)), + headerSize + sizeof(long)); + BitConverter.TryWriteBytes(new Span(serializationBuffer, headerSize + sizeof(int), sizeof(long)), amount); + + // For simplicity, start a new socket every operation + var endPoint = cluster[worker]; + using var socket = new Socket(endPoint.AddressFamily, SocketType.Stream, ProtocolType.Tcp); + socket.Connect(endPoint); + socket.Send(serializationBuffer, 0, sizeof(int) + headerSize + sizeof(long), SocketFlags.None); + + // We expect the same format back from server. 
First read the size field + var receivedBytes = 0; + while (receivedBytes < sizeof(int)) + receivedBytes += socket.Receive(serializationBuffer, receivedBytes, serializationBuffer.Length - receivedBytes, SocketFlags.None); + + var size = BitConverter.ToInt32(serializationBuffer); + // Now wait until the entire message arrives + while (receivedBytes < size + sizeof(int)) + receivedBytes += socket.Receive(serializationBuffer, receivedBytes, serializationBuffer.Length - receivedBytes, SocketFlags.None); + + // Forward the DPR response header after we are done + var status = session.ReceiveHeader(new Span(serializationBuffer, sizeof(int), size - sizeof(long)), out var v); + // Because the program does not simulate failures, we should never fail + Debug.Assert(status == DprBatchStatus.OK); + + versionTracker.Resolve(id, v); + + // (Non-DPR) Response is 8 bytes, + result = BitConverter.ToInt64(serializationBuffer, sizeof(int) + size - sizeof(long)); + return id; + } + + /// + /// Check whether the operation identified by seq is committed + /// + /// operation to check + /// whether operation is committed + public async ValueTask Committed(long seq) + { + while (true) + { + dprFinder.Refresh(); + versionTracker.HandleCommit(dprFinder.GetStateSnapshot()); + var cp = versionTracker.GetCommitPoint(); + // Because the session is strictly sequential, operation will never be in exception list. 
+ Debug.Assert(cp.ExcludedSerialNos.Count == 0); + if (seq < cp.UntilSerialNo) return; + await Task.Delay(10); + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/samples/DprCounters/DprCounters/CounterServer.cs b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterServer.cs new file mode 100644 index 000000000..90e514c17 --- /dev/null +++ b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterServer.cs @@ -0,0 +1,137 @@ +using System; +using System.Net; +using System.Net.Sockets; +using System.Runtime.InteropServices; +using System.Threading; +using FASTER.libdpr; + +namespace DprCounters +{ + /// + /// A single-threaded blocking server that accepts requests to atomically increment a counter. DPR-protected. + /// + public class CounterServer + { + private Socket socket; + private DprWorker dprWorker; + private ManualResetEventSlim termination; + + /// + /// Create a new CounterServer. + /// + /// ip address to listen + /// port number to listen + /// id of worker in DPR cluster + /// directory name to write checkpoint files to + /// DprFinder for the cluster + public CounterServer(string ip, int port, WorkerId me, string checkpointDir, IDprFinder dprFinder) + { + // Each DPR worker should be backed by one state object. The state object exposes some methods + // for the DPR logic to invoke when necessary, but DPR does not otherwise mediate user interactions + // with it. + var stateObject = new CounterStateObject(checkpointDir); + // A DPR server provides DPR methods that the users should invoke at appropriate points of execution. 
There + // should be one DPR server per worker in the cluster + dprWorker = new DprWorker(me, stateObject, dprFinder); + + var localEndpoint = new IPEndPoint(IPAddress.Parse(ip), port); + socket = new Socket(localEndpoint.AddressFamily, SocketType.Stream, ProtocolType.Tcp); + socket.Bind(localEndpoint); + } + + public void RunServer() + { + dprWorker.ConnectToCluster(); + + termination = new ManualResetEventSlim(); + // DprServer must be continually refreshed and checkpointed for the system to make progress. It is easiest + // to simply spawn a background thread to do that. + var backgroundThread = new Thread(() => + { + while (!termination.IsSet) + { + Thread.Sleep(10); + // A DprServer has built-in timers to rate-limit checkpoints and refreshes if needed + dprWorker.TryRefreshAndCheckpoint(100, 10); + } + }); + backgroundThread.Start(); + + // Allocate some memory buffers for a sequential, custom-built wire protocol for our CounterServer. + // DPR is not a net work protocol, although it expects some help from the host system to pass information + // around. + var inBuffer = new byte[1 << 15]; + var outBuffer = new byte[1 << 15]; + // A simple, sequential, blocking server implementation. + socket.Listen(512); + while (!termination.IsSet) + { + Socket conn; + try + { + conn = socket.Accept(); + } + catch (SocketException) + { + return; + } + + var receivedBytes = 0; + // Our protocol first reads a size field of the combined DPR header + messages + while (receivedBytes < sizeof(int)) + receivedBytes += conn.Receive(inBuffer, receivedBytes, inBuffer.Length - receivedBytes, + SocketFlags.None); + + var size = BitConverter.ToInt32(inBuffer); + // Receive the combined message. 
+ while (receivedBytes < size + sizeof(int)) + receivedBytes += conn.Receive(inBuffer, receivedBytes, inBuffer.Length - receivedBytes, + SocketFlags.None); + + // We can obtain the DPR header by computing the size information + var request = new ReadOnlySpan(inBuffer, sizeof(int), size - sizeof(int)); + + var responseBuffer = new Span(outBuffer, sizeof(int), outBuffer.Length - sizeof(int)); + + long result = 0; + // Before executing server-side logic, check with DPR to start tracking for the batch and make sure + // we are allowed to execute it. If not, the response header will be populated and we should immediately + // return that to the client side libDPR. + if (dprWorker.ReceiveAndBeginProcessing(request)) + { + // Execute the request batch. In this case, always a single increment operation. + result = dprWorker.StateObject().value; + dprWorker.StateObject().value += + BitConverter.ToInt64(new Span(inBuffer, sizeof(int) + size - sizeof(long), sizeof(long))); + + // Signal the end of execution for DPR to finish up and populate a response header + dprWorker.FinishProcessingAndSend(responseBuffer); + } + else + { + dprWorker.ComposeErrorResponse(request, responseBuffer); + } + + // The server is then free to convey the result back to the client any way it wants, so long as it + // forwards the DPR response header. 
In this case, we are using the same format as above by concatenating + // the DPR response and our response + BitConverter.TryWriteBytes(new Span(outBuffer, 0, sizeof(int)), + sizeof(long) + DprBatchHeader.FixedLenSize); + BitConverter.TryWriteBytes( + new Span(outBuffer, DprBatchHeader.FixedLenSize + sizeof(int), + outBuffer.Length - DprBatchHeader.FixedLenSize - sizeof(int)), result); + conn.Send(outBuffer, 0, sizeof(int) + DprBatchHeader.FixedLenSize + sizeof(long), SocketFlags.None); + // One socket connection per client for simplicity + conn.Close(); + } + + backgroundThread.Join(); + } + + public void StopServer() + { + socket.Dispose(); + termination.Set(); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/samples/DprCounters/DprCounters/CounterStateObject.cs b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterStateObject.cs new file mode 100644 index 000000000..f97c8c3bf --- /dev/null +++ b/cs/research/libdpr/samples/DprCounters/DprCounters/CounterStateObject.cs @@ -0,0 +1,93 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Net; +using FASTER.libdpr; + +namespace DprCounters +{ + /// + /// StateObject that encapsulates a single atomic counter + /// + /// A counter is an example of a simple state object --- one that does not need the fine grained version control + /// ability exposed by the full StateObject interface for concurrent access performance or (single-node) checkpoint + /// coordination. We can therefore just extend from SimpleStateObject + public sealed class CounterStateObject : IStateObject + { + private string checkpointDirectory; + private ConcurrentDictionary prevCounters = new(); + public long value; + + /// + /// Constructs a new CounterStateObject + /// + /// directory name to write checkpoints to + /// + /// version to start at. 
If version is not 0, CounterStateObject will attempt to restore + /// state from corresponding checkpoint + /// + public CounterStateObject(string checkpointDirectory) + { + this.checkpointDirectory = checkpointDirectory; + Directory.CreateDirectory(checkpointDirectory); + } + + // With SimpleStateObject, CounterStateObject only needs to implement a single-threaded + // checkpoint scheme. + public void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + if (metadata != null) throw new NotImplementedException(); + // Use a simple naming scheme to associate checkpoints with versions. A more sophisticated scheme may + // store persistent mappings or use other schemes to do so. + var fileName = Path.Join(checkpointDirectory, version.ToString()); + var fs = File.Open(fileName, FileMode.OpenOrCreate, FileAccess.Write, FileShare.None); + + // libDPR will ensure that request batches that are protected with VersionScheme.Enter() and + // VersionScheme.Leave() will not interleave with checkpoint or recovery code. It is therefore safe + // to read and write values without protection in this function + prevCounters[version] = value; + + // Once the content of the checkpoint is established (we have read a current snapshot of value), it is ok + // to write to disk asynchronously and allow other operations to continue. In SimpleStateObject, + // operations are blocked before PerformCheckpoint return. 
+ fs.WriteAsync(BitConverter.GetBytes(value), 0, sizeof(long)).ContinueWith(token => + { + if (!token.IsCompletedSuccessfully) + Console.WriteLine($"Error {token} during checkpoint"); + // We need to invoke onPersist() to inform DPR when a checkpoint is on disk + onPersist(); + fs.Dispose(); + }); + } + + // With SimpleStateObject, CounterStateObject can just implement a single-threaded blocking recovery function + public void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + metadata = null; + // This is for machines that did not physically go down (otherwise they will simply + // load the surviving version on restart). libDPR will additionally never request a worker to restore + // checkpoints earlier than the committed version in the DPR cut. We can therefore rely on a (relatively + // small) stash of in-memory snapshots to quickly handle this call. + if (prevCounters.TryGetValue(version, out value)) return; + + var fileName = Path.Join(checkpointDirectory, version.ToString()); + using var fs = File.Open(fileName, FileMode.Open, FileAccess.Read, FileShare.None); + + var bytes = new byte[sizeof(long)]; + fs.Read(bytes, 0, sizeof(long)); + value = BitConverter.ToInt64(bytes, 0); + } + + public void PruneVersion(long version) + { + prevCounters.TryRemove(version, out _); + } + + public IEnumerable<(byte[], int)> GetUnprunedVersions() + { + return Enumerable.Empty<(byte[], int)>(); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/samples/DprCounters/DprCounters/DprCounters.csproj b/cs/research/libdpr/samples/DprCounters/DprCounters/DprCounters.csproj new file mode 100644 index 000000000..2a9cd2b97 --- /dev/null +++ b/cs/research/libdpr/samples/DprCounters/DprCounters/DprCounters.csproj @@ -0,0 +1,14 @@ + + + + Exe + net7.0 + latest + true + + + + + + + diff --git a/cs/research/libdpr/samples/DprCounters/DprCounters/Program.cs b/cs/research/libdpr/samples/DprCounters/DprCounters/Program.cs new file mode 100644 index 
000000000..c2d6f9e59 --- /dev/null +++ b/cs/research/libdpr/samples/DprCounters/DprCounters/Program.cs @@ -0,0 +1,59 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Security.Cryptography; +using System.Threading; +using System.Threading.Tasks; +using FASTER.core; +using FASTER.libdpr; + +namespace DprCounters +{ + class Program + { + public static async Task Main(string[] args) + { + // Use a simple pair of in-memory storage to back our DprFinder server for now. Start a local DPRFinder + // server for the cluster + var localDevice1 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var localDevice2 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var device = new PingPongDevice(localDevice1, localDevice2); + using var dprFinderServer = new RespGraphDprFinderServer("127.0.0.1", 15721, new GraphDprFinderBackend(device)); + dprFinderServer.StartServer(); + + // Start two counter servers + var cluster = new Dictionary(); + + var w0 = new WorkerId(0); + cluster.Add(w0, new IPEndPoint(IPAddress.Parse("127.0.0.1"), 15722)); + var w0Server = new CounterServer("127.0.0.1", 15722, new WorkerId(0), "worker0/", + new RespGraphDprFinder("127.0.0.1", 15721)); + var w0Thread = new Thread(w0Server.RunServer); + w0Thread.Start(); + + + var w1 = new WorkerId(1); + cluster.Add(w1, new IPEndPoint(IPAddress.Parse("127.0.0.1"), 15723)); + var w1Server = new CounterServer("127.0.0.1", 15723, new WorkerId(1), "worker1/", + new RespGraphDprFinder("127.0.0.1", 15721)); + var w1Thread = new Thread(w1Server.RunServer); + w1Thread.Start(); + + + // Start a client that performs some operations + var client = new CounterClient(new RespGraphDprFinder("127.0.0.1", 15721), cluster); + var op0 = client.Increment(w0, 42, out _); + var op1 = client.Increment(w1, 2, out _); + var op2 = client.Increment(w1, 7, out _); + var op3 = client.Increment(w0, 10, out _); + await client.Committed(op3); + + // Shutdown + w0Server.StopServer(); + w0Thread.Join(); + + 
w1Server.StopServer(); + w1Thread.Join(); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/Defs.cs b/cs/research/libdpr/src/FASTER.libdpr/Defs.cs new file mode 100644 index 000000000..f0ddb0292 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/Defs.cs @@ -0,0 +1,168 @@ +using System; + +namespace FASTER.libdpr +{ + /// + /// A worker in the system manipulates uniquely exactly one state object. + /// + public struct DprWorkerId : IEquatable + { + public static readonly DprWorkerId INVALID = new DprWorkerId(-1); + + /// + /// globally-unique worker ID within a DPR cluster + /// + public readonly long guid; + + /// + /// Constructs a worker with the given guid + /// + /// worker guid + public DprWorkerId(long guid) + { + this.guid = guid; + } + + public readonly bool Equals(DprWorkerId other) + { + return guid == other.guid; + } + + public static bool operator ==(DprWorkerId left, DprWorkerId right) + { + return left.Equals(right); + } + + public static bool operator !=(DprWorkerId left, DprWorkerId right) + { + return !left.Equals(right); + } + + /// + public override bool Equals(object obj) + { + return obj is DprWorkerId other && Equals(other); + } + + /// + public override int GetHashCode() + { + return guid.GetHashCode(); + } + } + + /// + /// A worker-version is a tuple of worker and checkpoint version. 
+ /// + public struct WorkerVersion : IEquatable + { + /// + /// Worker + /// + public DprWorkerId DprWorkerId { get; set; } + + /// + /// Version + /// + public long Version { get; set; } + + /// + /// Constructs a new worker version object with given parameters + /// + /// worker + /// version + public WorkerVersion(DprWorkerId dprWorkerId, long version) + { + DprWorkerId = dprWorkerId; + Version = version; + } + + internal WorkerVersion(long worker, long version) : this(new DprWorkerId(worker), version) + { + } + + public static bool operator ==(WorkerVersion left, WorkerVersion right) + { + return left.Equals(right); + } + + public static bool operator !=(WorkerVersion left, WorkerVersion right) + { + return !left.Equals(right); + } + + public bool Equals(WorkerVersion other) + { + return DprWorkerId.Equals(other.DprWorkerId) && Version == other.Version; + } + + /// + public override bool Equals(object obj) + { + return obj is WorkerVersion other && Equals(other); + } + + /// + public override int GetHashCode() + { + unchecked + { + return (DprWorkerId.GetHashCode() * 397) ^ Version.GetHashCode(); + } + } + } + + /// + /// Speculation Unit ID + /// + public struct SUId : IEquatable + { + /// + /// The EXTERNAL SU is a special SU that can be used in either messages or workers. EXTERNAL messages are always + /// consumed only after commit. EXTERNAL workers wait until committed to consume any message. 
+ /// + public static readonly SUId EXTERNAL = new SUId(-1); + + /// + /// globally-unique worker ID within a DPR cluster + /// + public readonly long guid; + + /// + /// Constructs a worker with the given guid + /// + /// worker guid + public SUId(long guid) + { + this.guid = guid; + } + + public bool Equals(SUId other) + { + return guid == other.guid; + } + + public static bool operator ==(SUId left, SUId right) + { + return left.Equals(right); + } + + public static bool operator !=(SUId left, SUId right) + { + return !left.Equals(right); + } + + /// + public override bool Equals(object obj) + { + return obj is SUId other && Equals(other); + } + + /// + public override int GetHashCode() + { + return guid.GetHashCode(); + } + } +} + diff --git a/cs/research/libdpr/src/FASTER.libdpr/DprMessageBuffer.cs b/cs/research/libdpr/src/FASTER.libdpr/DprMessageBuffer.cs new file mode 100644 index 000000000..1c35ca442 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/DprMessageBuffer.cs @@ -0,0 +1,54 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using FASTER.core; + +namespace FASTER.libdpr +{ + // internal class DprMessageBuffer + // { + // private ConcurrentDictionary)> queues; + // private List toDelete; + // + // public DprMessageBuffer() + // { + // queues = new ConcurrentDictionary)>(); + // toDelete = new List(); + // } + // + // public void Buffer(ref DprMessageHeader header, Action item) + // { + // var wv = new WorkerVersion(header.SrcWorkerId, header.Version); + // var wl = header.worldLine; + // var entry = queues.GetOrAdd(wv, _ => ValueTuple.Create(wl, new ConcurrentQueue())); + // entry.Item2.Enqueue(item); + // } + // + // public void ProcessBuffer(IDprFinder dprFinder) + // { + // toDelete.Clear(); + // foreach (var (wv, entry) in queues) + // { + // switch (dprFinder.CheckStatus(entry.Item1, wv)) + // { + // case DprStatus.COMMITTED: + // while (!entry.Item2.TryDequeue(out var a)) 
a(); + // toDelete.Add(wv); + // break; + // case DprStatus.SPECULATIVE: + // break; + // case DprStatus.ROLLEDBACK: + // toDelete.Add(wv); + // break; + // default: + // throw new ArgumentOutOfRangeException(); + // } + // } + // + // + // foreach (var wv in toDelete) + // queues.TryRemove(wv, out _); + // } + // } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/DprMessageHeader.cs b/cs/research/libdpr/src/FASTER.libdpr/DprMessageHeader.cs new file mode 100644 index 000000000..ee4eb507b --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/DprMessageHeader.cs @@ -0,0 +1,25 @@ +using System; +using System.Runtime.InteropServices; + +namespace FASTER.libdpr +{ + /// + /// DPR metadata associated with each batch. Laid out continuously as: + /// header | deps (WorkerVersion[]) | versionTracking (long[]) + /// + [StructLayout(LayoutKind.Explicit, Size = 36)] + public unsafe struct DprMessageHeader + { + public const int FixedLenSize = 36; + public const string GprcMetadataKeyName = "DprHeader-bin"; + [FieldOffset(0)] internal fixed byte data[FixedLenSize]; + [FieldOffset(0)] internal DprWorkerId SrcWorkerId; + [FieldOffset(8)] internal SUId SrcSU; + // a batch should always consist of messages from the same world-lines on the client side. + // We can artificially write the servers to not write reply batches with more than one world-line. 
+ [FieldOffset(16)] internal long WorldLine; + [FieldOffset(24)] internal long Version; + [FieldOffset(32)] internal int NumClientDeps; + internal int ClientDepsOffset => FixedLenSize; + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/DprSession.cs b/cs/research/libdpr/src/FASTER.libdpr/DprSession.cs new file mode 100644 index 000000000..4730323eb --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/DprSession.cs @@ -0,0 +1,199 @@ +using System; +using System.Diagnostics; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.libdpr +{ + public class DprSessionRolledBackException : Exception + { + public readonly long NewWorldLine; + + public DprSessionRolledBackException(long newWorldLine) + { + NewWorldLine = newWorldLine; + } + } + + /// + /// A DprSession is a DPR entity that cannot commit/restore state, but communicates with other DPR entities and may + /// convey DPR dependencies (e.g., a client session). + /// + public class DprSession + { + internal long version, worldLine; + internal LightDependencySet deps; + + /// + /// WorldLine of the session + /// + public long WorldLine => worldLine >= 0 ? 
worldLine : -worldLine; + + public bool RolledBack => worldLine < 0; + + /// + /// Create a DPR session working on the supplied worldLine (or 1 by default, in a cluster that has never failed) + /// + /// the worldLine to start at, or 0 (wildcard that matches to the first received message's worldline) by default + public DprSession(long initialWorldLine = 0) + { + version = 1; + // 0 denotes that the session does not yet exist in a worldline + worldLine = initialWorldLine; + deps = new LightDependencySet(); + } + + + internal void UnsafeReset(long initialWorldLine = 0) + { + version = 1; + // 0 denotes that the session does not yet exist in a worldline + worldLine = initialWorldLine; + deps.UnsafeClear(); + } + + + internal void UnsafeReset(StateObject to) + { + version = to.Version(); + worldLine = to.WorldLine(); + deps.UnsafeClear(); + deps.Update(to.Me(), version); + } + + /// + /// Obtain a DPR header that encodes session dependency for an outgoing message + /// + /// byte array to write header into + /// size of the header, or negative of the required size to fit if supplied header is to small + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe int TagMessage(Span headerBytes) + { + if (RolledBack) + throw new DprSessionRolledBackException(WorldLine); + + fixed (byte* b = headerBytes) + { + var bend = b + headerBytes.Length; + ref var dprHeader = ref Unsafe.AsRef(b); + + // Populate header with relevant request information + if (headerBytes.Length >= DprMessageHeader.FixedLenSize) + { + dprHeader.SrcWorkerId = DprWorkerId.INVALID; + dprHeader.WorldLine = worldLine; + dprHeader.Version = version; + dprHeader.NumClientDeps = 0; + } + + // Populate tracking information into the batch + var copyHead = b + dprHeader.ClientDepsOffset; + foreach (var wv in deps) + { + dprHeader.NumClientDeps++; + // only copy if it fits + if (copyHead < bend - sizeof(WorkerVersion)) + Unsafe.AsRef(copyHead) = wv; + copyHead += sizeof(WorkerVersion); + } + + // Invert 
depends on whether or not we fit + return (int)(copyHead <= bend ? copyHead - b : b - copyHead); + } + } + + /// + /// Receive a message with the given header in this session. + /// + /// DPR header of the message to receive + /// version of the message + /// status of the batch. If status is ROLLBACK, this session must be rolled-back + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe bool Receive(ReadOnlySpan dprMessage) + { + if (RolledBack) + throw new DprSessionRolledBackException(WorldLine); + + fixed (byte* h = dprMessage) + { + ref var responseHeader = ref Unsafe.AsRef(h); + if (worldLine == 0) + Interlocked.CompareExchange(ref worldLine, responseHeader.WorldLine, 0); + + var wl = worldLine; + if (responseHeader.WorldLine > wl) + { + Interlocked.CompareExchange(ref worldLine, -responseHeader.WorldLine, wl); + throw new DprSessionRolledBackException(WorldLine); + } + + if (responseHeader.WorldLine < worldLine) + return false; + + Debug.Assert(responseHeader.WorldLine == worldLine); + + // Add largest worker-version as dependency for future ops + if (!responseHeader.SrcWorkerId.Equals(DprWorkerId.INVALID)) + deps.Update(responseHeader.SrcWorkerId, responseHeader.Version); + else + { + fixed (byte* d = responseHeader.data) + { + var depsHead = d + responseHeader.ClientDepsOffset; + for (var i = 0; i < responseHeader.NumClientDeps; i++) + { + ref var wv = ref Unsafe.AsRef(depsHead); + deps.Update(wv.DprWorkerId, wv.Version); + depsHead += sizeof(WorkerVersion); + } + } + } + + // Update versioning information + core.Utility.MonotonicUpdate(ref this.version, responseHeader.Version, out _); + } + + return true; + } + + public bool DependOn(StateObject so) + { + var version = so.Version(); + var wl = so.WorldLine(); + if (worldLine == 0) + Interlocked.CompareExchange(ref worldLine, wl, 0); + if (worldLine < wl) throw new DprSessionRolledBackException(wl); + if (worldLine > wl) return false; + deps.Update(so.Me(), version); + return true; + } + + + 
// TODO(Tianyu): Need to find a way for long-running sessions to prune its dependencies + // Not safe to invoke concurrently with other methods on this session + public async Task SpeculationBarrier(IDprFinder dprFinder, bool autoRefresh = false) + { + while (true) + { + if (autoRefresh) + dprFinder.RefreshStateless(); + if (worldLine != 0 && dprFinder.SystemWorldLine() != worldLine) + { + worldLine = -dprFinder.SystemWorldLine(); + throw new DprSessionRolledBackException(WorldLine); + } + + if (deps.All(wv => dprFinder.SafeVersion(wv.DprWorkerId) >= wv.Version)) + { + deps.UnsafeClear(); + return; + } + + // TODO(Tianyu): Fix busy wait + await Task.Yield(); + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/DprWorkerOptions.cs b/cs/research/libdpr/src/FASTER.libdpr/DprWorkerOptions.cs new file mode 100644 index 000000000..23b4fa043 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/DprWorkerOptions.cs @@ -0,0 +1,10 @@ +namespace FASTER.libdpr +{ + public class DprWorkerOptions + { + public DprWorkerId Me; + public IDprFinder DprFinder = null; + public long CheckpointPeriodMilli = 5; + public long RefreshPeriodMilli = 5; + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/FASTER.libdpr.csproj b/cs/research/libdpr/src/FASTER.libdpr/FASTER.libdpr.csproj new file mode 100644 index 000000000..76ccfe79f --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/FASTER.libdpr.csproj @@ -0,0 +1,42 @@ + + + + net7.0 + true + latest + AnyCPU;x64 + true + true + Microsoft + Microsoft + This is a pre-release test build for libDPR, a project under Microsoft's FASTER + © Microsoft Corporation. All rights reserved. 
+ MIT + https://github.com/microsoft/FASTER + https://github.com/microsoft/FASTER + git + FASTER + See the project website at https://github.com/microsoft/FASTER for more details + false + 1.0.1 + + + + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + diff --git a/cs/research/libdpr/src/FASTER.libdpr/IStateObjectAttachment.cs b/cs/research/libdpr/src/FASTER.libdpr/IStateObjectAttachment.cs new file mode 100644 index 000000000..2cf896c2f --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/IStateObjectAttachment.cs @@ -0,0 +1,30 @@ +using System; + +namespace FASTER.libdpr +{ + /// + /// A StateObjectAttachment is a piece of data structure that can be checkpointed and recovered with state object + /// content when attached to a DprWorker. The content is guaranteed to be consistent with StateObject if + /// modification of the attachment only happens within protected processing blocks in DprWorker. DprWorker + /// guarantees that these functions are invoked single-threaded and will not interleave with protected processing + /// blocks. 
+ /// + public interface IStateObjectAttachment + { + /// + /// The size of the attachment when serialized, in bytes + int SerializedSize(); + + /// + /// Serializes the attachment to the given buffer, which is guaranteed to be at least as large as SerializedSize() + /// + /// The buffer to serialize to + void SerializeTo(Span buffer); + + /// + /// Recover attachment state from the given bytes + /// + /// serialized bytes + void RecoverFrom(ReadOnlySpan serialized); + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/LightDependencySet.cs b/cs/research/libdpr/src/FASTER.libdpr/LightDependencySet.cs new file mode 100644 index 000000000..ecbe27231 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/LightDependencySet.cs @@ -0,0 +1,109 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.libdpr +{ + /// + /// Class used to inexpensively track worker version dependency for a workers and client sessions. + /// Can only correctly track dependencies within a cluster up to size MaxClusterSize + /// + public sealed class LightDependencySet : IEnumerable + { + private const int MaxSizeBits = 8; + + /** + * The maximum number of workers in a cluster this light dependency set can support. Compile-time constant. 
+ */ + public const int MaxClusterSize = 1 << MaxSizeBits; + + private const long NoDependency = -1; + private readonly long[] dependentVersions; + + /// + /// Constructs a new light dependency set + /// + public LightDependencySet() + { + dependentVersions = new long[1 << MaxSizeBits]; + for (var i = 0; i < dependentVersions.Length; i++) + dependentVersions[i] = NoDependency; + } + + /// + public IEnumerator GetEnumerator() + { + return new LightDependencySetEnumerator(this); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + /// + /// Add dependency of (worker, version) + /// + /// worker + /// version + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Update(DprWorkerId dprWorkerId, long version) + { + ref var originalVersion = ref dependentVersions[dprWorkerId.guid]; + core.Utility.MonotonicUpdate(ref originalVersion, version, out _); + } + + /// + /// Removes the dependency of (worker, version) and all previous versions of the worker if present + /// + /// worker + /// version + /// whether the dependency was removed + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryRemove(DprWorkerId dprWorkerId, long version) + { + ref var originalVersion = ref dependentVersions[dprWorkerId.guid]; + return Interlocked.CompareExchange(ref originalVersion, NoDependency, version) >= version; + } + + internal void UnsafeClear() + { + for (var i = 0; i < dependentVersions.Length; i++) + dependentVersions[i] = NoDependency; + } + + private class LightDependencySetEnumerator : IEnumerator + { + private readonly LightDependencySet dependencySet; + private int index = -1; + + public LightDependencySetEnumerator(LightDependencySet dependencySet) + { + this.dependencySet = dependencySet; + } + + public bool MoveNext() + { + while (++index < MaxClusterSize) + if (dependencySet.dependentVersions[index] != NoDependency) + return true; + return false; + } + + public void Reset() + { + index = -1; + } + + public 
WorkerVersion Current => new WorkerVersion(index, dependencySet.dependentVersions[index]); + + object IEnumerator.Current => Current; + + public void Dispose() + { + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/SerializationUtil.cs b/cs/research/libdpr/src/FASTER.libdpr/SerializationUtil.cs new file mode 100644 index 000000000..555a02804 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/SerializationUtil.cs @@ -0,0 +1,122 @@ +using System; +using System.Collections; +using System.Collections.Generic; + +namespace FASTER.libdpr +{ + public static class SerializationUtil + { + internal static unsafe int SerializeCheckpointMetadata(Span buffer, long worldLine, + WorkerVersion checkpointed, IEnumerable deps) + { + fixed (byte* b = buffer) + { + var end = b + buffer.Length; + var head = b; + if (!BitConverter.TryWriteBytes(new Span(head, (int) (end - head)), worldLine)) return 0; + head += sizeof(long); + if (!BitConverter.TryWriteBytes(new Span(head, (int) (end - head)), checkpointed.DprWorkerId.guid)) + return 0; + head += sizeof(long); + if (!BitConverter.TryWriteBytes(new Span(head, (int) (end - head)), checkpointed.Version)) + return 0; + head += sizeof(long); + // skip 4 bytes of size field for now; + var sizeField = (int *) head; + if ((int) (end - head) < sizeof(int)) return 0; + head += sizeof(int); + var numDeps = 0; + foreach (var wv in deps) + { + numDeps++; + head += sizeof(long); + if (!BitConverter.TryWriteBytes(new Span(head, (int) (end - head)), wv.DprWorkerId.guid)) return 0; + head += sizeof(long); + if (!BitConverter.TryWriteBytes(new Span(head, (int) (end - head)), wv.Version)) return 0; + } + + *sizeField = numDeps; + return (int) (head - b); + } + } + + public static unsafe int DeserializeCheckpointMetadata(ReadOnlySpan buffer, out long worldLine, + out WorkerVersion checkpointed, out IEnumerable deps) + { + fixed (byte* b = buffer) + { + var head = b; + worldLine = *(long *) head; + head += 
sizeof(long); + var worker = *(long *) head; + head += sizeof(long); + var version = *(long *) head; + head += sizeof(long); + checkpointed = new WorkerVersion(worker, version); + var d = new EnumerableSerializedDeps(head); + deps = d; + return (int) (d.GetDepsTail() - b); + } + } + + internal unsafe class EnumerableSerializedDeps : IEnumerable + { + private readonly byte* head; + + public EnumerableSerializedDeps(byte* head) + { + this.head = head; + } + + public byte* GetDepsTail() + { + var numDeps = *(int*) head; + return head + sizeof(int) + 2 * sizeof(long) * numDeps; + } + + public IEnumerator GetEnumerator() + { + return new Enumerator(head); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + private class Enumerator : IEnumerator + { + private readonly byte* head; + private readonly int size; + private int ptr; + + public Enumerator(byte* head) + { + this.head = head; + size = *(int *) head; + ptr = sizeof(int) - sizeof(long) * 2; + } + + public bool MoveNext() + { + ptr += sizeof(long) * 2; + return ptr < size; + } + + public void Reset() + { + ptr = sizeof(int) - sizeof(long) * 2; + } + + public WorkerVersion Current => + new WorkerVersion(*(long*) (head + ptr), *(long*) (head + ptr + sizeof(long))); + + object IEnumerator.Current => Current; + + public void Dispose() + { + } + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/StateObject.cs b/cs/research/libdpr/src/FASTER.libdpr/StateObject.cs new file mode 100644 index 000000000..a69d975e1 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/StateObject.cs @@ -0,0 +1,716 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; +using FASTER.common; +using FASTER.core; +using Google.Protobuf; + +namespace FASTER.libdpr +{ + /// + 
/// A DprWorker corresponds to an individual stateful failure domain (e.g., a physical machine or VM) in the system. + /// DprWorkers have access to some external persistent storage and can commit and restore state through it using the + /// StateObject implementation. + /// + /// type of state object + public abstract class StateObject : IDisposable + { + private readonly SimpleObjectPool dependencySetPool; + public readonly DprWorkerOptions options; + + private readonly ConcurrentDictionary versions; + protected readonly IVersionScheme versionScheme; + private long worldLine = 1; + + private long lastCheckpointMilli, lastRefreshMilli; + private Stopwatch sw = Stopwatch.StartNew(); + + private readonly byte[] depSerializationArray; + private TaskCompletionSource nextCommit; + + private List attachments = new List(); + private byte[] metadataBuffer = new byte[1 << 15]; + + private SimpleObjectPool sessionPool; + private bool connected; + + private long largestRequestedCheckpointVersion = -1; + private SemaphoreSlim rateLimiter = new SemaphoreSlim(Environment.ProcessorCount, Environment.ProcessorCount); + + private class CheckpointStateMachine : VersionSchemeStateMachine + { + internal const byte IN_PROG = 1; + private bool checkpointComplete = false; + private StateObject so; + + public CheckpointStateMachine(StateObject so, long targetVersion = -1) : base(targetVersion) + { + this.so = so; + } + + public override bool GetNextStep(VersionSchemeState currentState, out VersionSchemeState nextState) + { + switch (currentState.Phase) + { + case VersionSchemeState.REST: + nextState = VersionSchemeState.Make(IN_PROG, actualToVersion); + return true; + case IN_PROG: + nextState = VersionSchemeState.Make(VersionSchemeState.REST, actualToVersion); + return checkpointComplete; + default: + throw new NotImplementedException(); + } + } + + public override void OnEnteringState(VersionSchemeState fromState, VersionSchemeState toState) + { + if (fromState.Phase == 
VersionSchemeState.REST) + { + // Prepare checkpoint metadata + int length; + var deps = so.ComputeCheckpointMetadata(fromState.Version); + Debug.Assert(so.MetadataSize(deps) < so.metadataBuffer.Length); + unsafe + { + fixed (byte* dst = so.metadataBuffer) + { + var head = dst; + var end = dst + so.metadataBuffer.Length; + deps.CopyTo(new Span(head, (int)(end - head))); + head += deps.Length; + + *(int*)head = so.attachments.Count; + head += sizeof(int); + + foreach (var attachment in so.attachments) + { + var size = attachment.SerializedSize(); + *(int*)head = size; + head += sizeof(int); + attachment.SerializeTo(new Span(head, (int)(end - head))); + head += size; + } + + length = (int)(head - dst); + } + } + + // Perform checkpoint with a callback to report persistence and clean-up leftover tracking state + so.PerformCheckpoint(fromState.Version, new Span(so.metadataBuffer, 0, length), () => + { + so.versions.TryRemove(fromState.Version, out var deps); + var workerVersion = new WorkerVersion(so.options.Me, fromState.Version); + so.options.DprFinder?.ReportNewPersistentVersion(so.worldLine, workerVersion, deps); + so.dependencySetPool.Return(deps); + checkpointComplete = true; + so.versionScheme.SignalStepAvailable(); + }); + + // Prepare new version before any operations can occur in it + var newDeps = so.dependencySetPool.Checkout(); + if (fromState.Version != 0) newDeps.Update(so.options.Me, fromState.Version); + var success = so.versions.TryAdd(toState.Version, newDeps); + Debug.Assert(success); + } + } + + public override void AfterEnteringState(VersionSchemeState state) + { + } + } + + /// + /// Creates a new DprServer. 
+ /// + /// underlying state object + /// DPR worker options + // TODO(Tianyu): Put some design work into the different operating modes of applications written this way -- speculative/pessimistic/no guarantees + public StateObject(IVersionScheme versionScheme, DprWorkerOptions options) + { + this.options = options; + this.versionScheme = versionScheme; + + versions = new ConcurrentDictionary(); + dependencySetPool = new SimpleObjectPool(() => new LightDependencySet()); + depSerializationArray = new byte[2 * LightDependencySet.MaxClusterSize * sizeof(long)]; + nextCommit = new TaskCompletionSource(); + sessionPool = new SimpleObjectPool(() => new DprSession()); + } + + public IDprFinder GetDprFinder() => options.DprFinder; + + /// + /// A task that completes when the next commit is recoverable + public Task NextCommit() => nextCommit.Task; + + /// + /// Add the given attachment to the DprWorker. Should only be invoked before connecting to the cluster. + /// + /// the attachment to add + public void AddAttachment(IStateObjectAttachment attachment) + { + attachments.Add(attachment); + } + + /// + /// Worker ID of this DprServer instance + public DprWorkerId Me() => options.Me; + + // TODO: The following two methods are technically only meaningful under protection + /// + /// WorldLine of current DprWorker + public long WorldLine() => worldLine; + + /// + /// Version of current DprWorker + public long Version() => versionScheme.CurrentState().Version; + + public bool ConnectedToCluster() => connected; + + private Task BeginRestore(long newWorldLine, long version) + { + var tcs = new TaskCompletionSource(); + // Restoration to this particular worldline has already been completed + if (worldLine >= newWorldLine) return Task.CompletedTask; + + versionScheme.TryAdvanceVersionWithCriticalSection((vOld, vNew) => + { + // Restore underlying state object state + RestoreCheckpoint(version, out var metadata); + // Use the restored metadata to restore attachments state + unsafe + 
{ + fixed (byte* src = metadata) + { + var head = src + + SerializationUtil.DeserializeCheckpointMetadata(metadata, out _, out _, out _); + var numAttachments = *(int*)head; + head += sizeof(int); + Debug.Assert(numAttachments == attachments.Count, + "recovered checkpoint contains a different number of attachments!"); + foreach (var attachment in attachments) + { + var size = *(int*)head; + head += sizeof(int); + attachment.RecoverFrom(new Span(head, size)); + head += size; + } + } + } + + // Clear any leftover state and signal complete + versions.Clear(); + var deps = dependencySetPool.Checkout(); + if (vOld != 0) + deps.Update(options.Me, vOld); + var success = versions.TryAdd(vNew, deps); + Debug.Assert(success); + tcs.SetResult(null); + worldLine = newWorldLine; + }, Math.Max(version, versionScheme.CurrentState().Version) + 1); + + return tcs.Task; + } + + internal int MetadataSize(ReadOnlySpan deps) + { + var result = deps.Length + sizeof(int); + foreach (var attachment in attachments) + result += sizeof(int) + attachment.SerializedSize(); + return result; + } + + private bool BeginCheckpoint(long targetVersion = -1) + { + if (versionScheme.CurrentState().Phase != VersionSchemeState.REST) return false; + if (versionScheme.TryExecuteStateMachine(new CheckpointStateMachine(this, targetVersion)) == + StateMachineExecutionStatus.OK) + { + core.Utility.MonotonicUpdate(ref lastCheckpointMilli, sw.ElapsedMilliseconds, out _); + return true; + } + + return false; + } + + /// + /// At the start (restart) of processing, connect to the rest of the DPR cluster. If the worker restarted from + /// an existing instance, the cluster will detect this and trigger rollback as appropriate across the cluster, + /// and the worker will automatically load the correct checkpointed state for recovery. Must be invoked exactly + /// once before any other operations. 
+ /// + public void ConnectToCluster(out bool restored) + { + if (connected) + throw new InvalidOperationException("Cannot connect to a cluster twice"); + long versionToRecover = 0; + if (options.DprFinder != null) + { + versionToRecover = options.DprFinder.AddWorker(options.Me, GetUnprunedVersions); + } + else + { + foreach (var v in GetUnprunedVersions()) + { + SerializationUtil.DeserializeCheckpointMetadata(v.Span, out _, out var wv, out _); + if (wv.Version > versionToRecover) + versionToRecover = wv.Version; + } + } + + // This worker is recovering from some failure and we need to load said checkpoint + restored = versionToRecover != 0; + if (restored) + BeginRestore(options.DprFinder?.SystemWorldLine() ?? 1, versionToRecover).GetAwaiter().GetResult(); + else + { + var deps = dependencySetPool.Checkout(); + var success = versions.TryAdd(1, deps); + Debug.Assert(success); + } + + options.DprFinder?.Refresh(options.Me, GetUnprunedVersions); + connected = true; + } + + internal ReadOnlySpan ComputeCheckpointMetadata(long version) + { + var deps = versions[version]; + var size = SerializationUtil.SerializeCheckpointMetadata(depSerializationArray, + worldLine, new WorkerVersion(options.Me, version), deps); + Debug.Assert(size > 0); + return new ReadOnlySpan(depSerializationArray, 0, size); + } + + /// + /// Get the largest version number that is considered committed (will be recovered to) of this DPR Worker + public long CommittedVersion() + { + return options.DprFinder?.SafeVersion(Me()) ?? 
Version() - 1; + } + + public void Refresh() + { + var currentTime = sw.ElapsedMilliseconds; + var lastCommitted = CommittedVersion(); + + if (options.DprFinder != null && lastRefreshMilli + options.RefreshPeriodMilli < currentTime) + { + // A false return indicates that the DPR finder does not have a cut available, this is usually due to + // restart from crash, at which point we should resend the graph + options.DprFinder.Refresh(options.Me, GetUnprunedVersions); + core.Utility.MonotonicUpdate(ref lastRefreshMilli, currentTime, out _); + if (worldLine != options.DprFinder.SystemWorldLine()) + BeginRestore(options.DprFinder.SystemWorldLine(), options.DprFinder.SafeVersion(options.Me)) + .GetAwaiter().GetResult(); + } + + if (lastCheckpointMilli + options.CheckpointPeriodMilli <= currentTime) + { + // TODO(Tianyu): Should avoid unnecessarily performing a checkpoint when underlying state object has not changed + // TODO(Tianyu): Study when to fast-forward a version by more than one + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, + versionScheme.CurrentState().Version + 1, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + } + + // Can prune dependency information of committed versions + var newCommitted = CommittedVersion(); + if (lastCommitted != newCommitted) + { + var oldTask = nextCommit; + nextCommit = new TaskCompletionSource(); + oldTask.SetResult(newCommitted); + } + + for (var i = lastCommitted; i < newCommitted; i++) + if (i != 0) + PruneVersion(i); + } + + private unsafe void UpdateDeps(ReadOnlySpan headerBytes) + { + ref var header = + ref MemoryMarshal.GetReference(MemoryMarshal.Cast(headerBytes)); + // Update batch dependencies to the current worker-version. This is an over-approximation, as the batch + // could get processed at a future version instead due to thread timing. However, this is not a correctness + // issue, nor do we lose much precision as batch-level dependency tracking is already an approximation. 
+ var deps = versions[versionScheme.CurrentState().Version]; + if (!header.SrcWorkerId.Equals(DprWorkerId.INVALID)) + deps.Update(header.SrcWorkerId, header.Version); + unsafe + { + fixed (byte* d = header.data) + { + var depsHead = d + header.ClientDepsOffset; + for (var i = 0; i < header.NumClientDeps; i++) + { + ref var wv = ref Unsafe.AsRef(depsHead); + deps.Update(wv.DprWorkerId, wv.Version); + depsHead += sizeof(WorkerVersion); + } + } + } + } + + private (long, long) GetWorldLineAndVersion(ReadOnlySpan headerBytes) + { + ref var header = + ref MemoryMarshal.GetReference(MemoryMarshal.Cast(headerBytes)); + return (header.WorldLine, header.Version); + } + + + public async ValueTask TryReceiveAndStartActionAsync(byte[] headerBytes, + LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + var (wl, v) = GetWorldLineAndVersion(headerBytes); + + // Apply the commit ordering rule, taking checkpoints if necessary. + if (v > versionScheme.CurrentState().Version) + { + await rateLimiter.WaitAsync(); + while (v > versionScheme.CurrentState().Version) + { + // TODO(Tianyu): Should provide version that does not take checkpoints on the spot? + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, v, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + Thread.Yield(); + } + + rateLimiter.Release(); + } + + // Enter protected region so the world-line does not shift while we determine whether a message is safe to consume + versionScheme.Enter(context); + // If the worker world-line is behind, wait for worker to recover up to the same point as the client, + // so client operation is not lost in a rollback that the client has already observed. + while (wl > worldLine) + { + versionScheme.Leave(context); + // TODO(Tianyu): Should provide version that does not rollback on the spot? 
+ await BeginRestore(wl, options.DprFinder.SafeVersion(options.Me)); + versionScheme.Enter(context); + } + + // If the worker world-line is newer, the request must be dropped. + if (wl != 0 && wl < worldLine) + { + versionScheme.Leave(); + return false; + } + + UpdateDeps(headerBytes); + return true; + } + + public async ValueTask TryReceiveAndStartActionAsync(ByteString headerBytes, + LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + var (wl, v) = GetWorldLineAndVersion(headerBytes.Span); + + if (v > versionScheme.CurrentState().Version) + { + await rateLimiter.WaitAsync(); + while (v > versionScheme.CurrentState().Version) + { + // TODO(Tianyu): Should provide version that does not take checkpoints on the spot? + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, v, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + Thread.Yield(); + } + + rateLimiter.Release(); + } + + // Enter protected region so the world-line does not shift while we determine whether a message is safe to consume + versionScheme.Enter(context); + // If the worker world-line is behind, wait for worker to recover up to the same point as the client, + // so client operation is not lost in a rollback that the client has already observed. + while (wl > worldLine) + { + versionScheme.Leave(context); + // TODO(Tianyu): Should provide version that does not rollback on the spot? + await BeginRestore(wl, options.DprFinder.SafeVersion(options.Me)); + versionScheme.Enter(context); + } + + // If the worker world-line is newer, the request must be dropped. 
+ if (wl != 0 && wl < worldLine) + { + versionScheme.Leave(); + return false; + } + + UpdateDeps(headerBytes.Span); + return true; + } + + public bool TryReceiveAndStartAction(ReadOnlySpan headerBytes, LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + var (wl, v) = GetWorldLineAndVersion(headerBytes); + + // Apply the commit ordering rule, taking checkpoints if necessary. + while (v > versionScheme.CurrentState().Version) + { + // TODO(Tianyu): Should provide version that does not take checkpoints on the spot? + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, v, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + Thread.Yield(); + } + + // Enter protected region so the world-line does not shift while we determine whether a message is safe to consume + versionScheme.Enter(context); + // If the worker world-line is behind, wait for worker to recover up to the same point as the client, + // so client operation is not lost in a rollback that the client has already observed. + while (wl > worldLine) + { + versionScheme.Leave(context); + // TODO(Tianyu): Should provide version that does not rollback on the spot? + BeginRestore(wl, options.DprFinder.SafeVersion(options.Me)).GetAwaiter().GetResult(); + Thread.Yield(); + versionScheme.Enter(context); + } + + // If the worker world-line is newer, the request must be dropped. 
+ if (wl != 0 && wl < worldLine) + { + versionScheme.Leave(); + return false; + } + + UpdateDeps(headerBytes); + return true; + } + + public unsafe bool TakeOnDependencyAndStartAction(DprSession session, LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + var wl = session.WorldLine; + var v = session.version; + + if (v > versionScheme.CurrentState().Version) + { + while (v > versionScheme.CurrentState().Version) + { + // TODO(Tianyu): Should provide version that does not take checkpoints on the spot? + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, v, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + Thread.Yield(); + } + } + + // Enter protected region so the world-line does not shift while we determine whether a message is safe to consume + versionScheme.Enter(context); + // If the worker world-line is behind, wait for worker to recover up to the same point as the client, + // so client operation is not lost in a rollback that the client has already observed. + while (wl > worldLine) + { + versionScheme.Leave(context); + // TODO(Tianyu): Should provide version that does not rollback on the spot? + BeginRestore(wl, options.DprFinder.SafeVersion(options.Me)).GetAwaiter().GetResult(); + versionScheme.Enter(context); + } + + // If the worker world-line is newer, the request must be dropped. + if (wl != 0 && wl < worldLine) + { + versionScheme.Leave(); + return false; + } + + // Update batch dependencies to the current worker-version. This is an over-approximation, as the batch + // could get processed at a future version instead due to thread timing. However, this is not a correctness + // issue, nor do we lose much precision as batch-level dependency tracking is already an approximation. 
+ var deps = versions[versionScheme.CurrentState().Version]; + foreach (var wv in session.deps) + deps.Update(wv.DprWorkerId, wv.Version); + return true; + } + + public async ValueTask TakeOnDependencyAndStartActionAsync(DprSession session, + LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + var wl = session.WorldLine; + var v = session.version; + + if (v > versionScheme.CurrentState().Version) + { + await rateLimiter.WaitAsync(); + while (v > versionScheme.CurrentState().Version) + { + // TODO(Tianyu): Should provide version that does not take checkpoints on the spot? + core.Utility.MonotonicUpdate(ref largestRequestedCheckpointVersion, v, out _); + BeginCheckpoint(largestRequestedCheckpointVersion); + Thread.Yield(); + } + + rateLimiter.Release(); + } + + // Enter protected region so the world-line does not shift while we determine whether a message is safe to consume + versionScheme.Enter(context); + // If the worker world-line is behind, wait for worker to recover up to the same point as the client, + // so client operation is not lost in a rollback that the client has already observed. + while (wl > worldLine) + { + versionScheme.Leave(context); + // TODO(Tianyu): Should provide version that does not rollback on the spot? + await BeginRestore(wl, options.DprFinder.SafeVersion(options.Me)); + versionScheme.Enter(context); + } + + // If the worker world-line is newer, the request must be dropped. + if (wl != 0 && wl < worldLine) + { + versionScheme.Leave(); + return false; + } + + // Update batch dependencies to the current worker-version. This is an over-approximation, as the batch + // could get processed at a future version instead due to thread timing. However, this is not a correctness + // issue, nor do we lose much precision as batch-level dependency tracking is already an approximation. 
+ var deps = versions[versionScheme.CurrentState().Version]; + foreach (var wv in session.deps) + deps.Update(wv.DprWorkerId, wv.Version); + return true; + } + + public async ValueTask TryMergeAndStartActionAsync(DprSession session, + LightEpoch.EpochContext context = null) + { + var result = await TakeOnDependencyAndStartActionAsync(session, context); + sessionPool.Return(session); + return result; + } + + public void StartLocalAction(LightEpoch.EpochContext context = null) => versionScheme.Enter(context); + + public void EndAction(LightEpoch.EpochContext context = null) => versionScheme.Leave(context); + + public int ProduceTagAndEndAction(Span outputHeaderBytes, LightEpoch.EpochContext context = null) + { + // Should not be interacting with DPR-related things if speculation is disabled + if (options.DprFinder == null) throw new InvalidOperationException(); + + if (outputHeaderBytes.Length < DprMessageHeader.FixedLenSize) + return -DprMessageHeader.FixedLenSize; + + ref var outputHeader = + ref MemoryMarshal.GetReference(MemoryMarshal.Cast(outputHeaderBytes)); + + outputHeader.SrcWorkerId = Me(); + outputHeader.WorldLine = worldLine; + outputHeader.Version = versionScheme.CurrentState().Version; + outputHeader.NumClientDeps = 0; + EndAction(context); + return DprMessageHeader.FixedLenSize; + } + + public DprSession DetachFromWorker() + { + var session = sessionPool.Checkout(); + session.UnsafeReset(this); + return session; + } + + + public DprSession DetachFromWorkerAndPauseAction(LightEpoch.EpochContext context = null) + { + var session = DetachFromWorker(); + EndAction(context); + return session; + } + + public unsafe bool TryMergeAndStartAction(DprSession detachedSession, LightEpoch.EpochContext context = null) + { + var result = TakeOnDependencyAndStartAction(detachedSession, context); + sessionPool.Return(detachedSession); + return result; + } + + + public bool IsCompatible(DprSession detachedSession) + { + return detachedSession.WorldLine == worldLine; + } 
+ + /// + /// Force the execution of a checkpoint ahead of the schedule specified at creation time. + /// Resets the checkpoint schedule to happen checkpoint_milli after this invocation. + /// + /// the version to jump to after the checkpoint, or -1 for the immediate next version + public void ForceCheckpoint(long targetVersion = -1) + { + core.Utility.MonotonicUpdate(ref lastCheckpointMilli, sw.ElapsedMilliseconds, out _); + BeginCheckpoint(targetVersion); + } + + public void ForceRefresh() + { + if (options.DprFinder == null) return; + options.DprFinder.Refresh(options.Me, GetUnprunedVersions); + core.Utility.MonotonicUpdate(ref lastRefreshMilli, sw.ElapsedMilliseconds, out _); + if (worldLine != options.DprFinder.SystemWorldLine()) + BeginRestore(options.DprFinder.SystemWorldLine(), options.DprFinder.SafeVersion(options.Me)) + .GetAwaiter().GetResult(); + } + + /// + /// Performs a checkpoint uniquely identified by the given version number along with the given metadata to be + /// persisted. Implementers are allowed to return as soon as the checkpoint content is finalized, but before + /// it is persistent, but must invoke onPersist afterwards. LibDPR will ensure that this function does not + /// interleave with protected processing logic, other checkpoint requests, or restores. + /// + /// A monotonically increasing unique ID describing this checkpoint > + /// Any metadata, in bytes, to be persisted along with the checkpoint + /// Callback to invoke when checkpoint is persistent + public abstract void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist); + + /// + /// Recovers to a previous checkpoint as identified by the version number, along with any metadata. The function + /// returns only after the state is restored for all future calls. LibDPR will not interleave batch operation, + /// other checkpoint requests, or restore requests with this function. 
+ /// + /// unique ID for the checkpoint to recover > + /// Any metadata, in bytes, persisted along with the checkpoint + public abstract void RestoreCheckpoint(long version, out ReadOnlySpan metadata); + + /// + /// Removes a version from persistent storage. This method is only invoked when a version will no longer be + /// recovered to. + /// + /// unique ID for the checkpoint to remove + public abstract void PruneVersion(long version); + + /// + /// Retrieves information about all unpruned checkpoints on persistent storage, along with persisted metadata. + /// + /// + /// enumerable of bytes that denotes the metadata of each unpruned checkpoint + /// + public abstract IEnumerable> GetUnprunedVersions(); + + public abstract void Dispose(); + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/StateObjectRefreshBackgroundService.cs b/cs/research/libdpr/src/FASTER.libdpr/StateObjectRefreshBackgroundService.cs new file mode 100644 index 000000000..1f45d1054 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/StateObjectRefreshBackgroundService.cs @@ -0,0 +1,59 @@ +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; + +namespace FASTER.libdpr +{ + public class StateObjectRefreshBackgroundService : BackgroundService + { + private ILogger logger; + private CancellationToken stoppingToken; + private Thread refreshThread; + private List stateObjects = new List(); + + public StateObjectRefreshBackgroundService(ILogger logger, + StateObject defaultObject = null) + { + this.logger = logger; + if (defaultObject != null) stateObjects.Add(defaultObject); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + logger?.LogInformation("Refresh background service is starting"); + refreshThread = new Thread(() => + { + while (!stoppingToken.IsCancellationRequested) + { + lock (stateObjects) + { + 
foreach (var so in stateObjects) + { + // Must not begin refreshing before the state object is connected + if (so.ConnectedToCluster()) + so.Refresh(); + } + } + + Thread.Yield(); + } + + }); + refreshThread.Start(); + await Task.Delay(Timeout.Infinite, this.stoppingToken); + logger?.LogInformation("Refresh background service is winding down"); + refreshThread.Join(); + } + + public void RegisterRefreshTask(StateObject toRegister) + { + lock (stateObjects) + { + if (stoppingToken.IsCancellationRequested) throw new TaskCanceledException(); + stateObjects.Add(toRegister); + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprClientInterceptor.cs b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprClientInterceptor.cs new file mode 100644 index 000000000..4dcb61af6 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprClientInterceptor.cs @@ -0,0 +1,68 @@ +using System; +using System.Diagnostics; +using System.Threading.Tasks; +using FASTER.common; +using Grpc.Core; +using Grpc.Core.Interceptors; + +namespace FASTER.libdpr.gRPC +{ + public class DprClientInterceptor : Interceptor + { + private DprSession session; + private ThreadLocalObjectPool serializationArrayPool; + + public DprClientInterceptor(DprSession session) + { + this.session = session; + serializationArrayPool = new ThreadLocalObjectPool(() => new byte[1 << 10]); + } + + public override TResponse BlockingUnaryCall(TRequest request, + ClientInterceptorContext context, + BlockingUnaryCallContinuation continuation) + { + // TODO(Tianyu): getting headers/trailers apparently unsupported in blocking version of the call + throw new NotImplementedException(); + } + + public override AsyncUnaryCall AsyncUnaryCall(TRequest request, + ClientInterceptorContext context, + AsyncUnaryCallContinuation continuation) + { + var buffer = serializationArrayPool.Checkout(); + session.TagMessage(buffer); + // TODO(Tianyu): Add logic to await for commit if crossing SU + + var 
headers = context.Options.Headers; + if (headers == null) + { + // TODO(Tianyu): Is this object expensive? + headers = new Metadata(); + var options = context.Options.WithHeaders(headers); + context = new ClientInterceptorContext(context.Method, context.Host, options); + } + + // TODO(Tianyu): Why no span variant? + headers.Add(DprMessageHeader.GprcMetadataKeyName, buffer); + // TODO(Tianyu): Assuming it is ok now to return into object pool? + serializationArrayPool.Return(buffer); + + var call = continuation(request, context); + return new AsyncUnaryCall(HandleTrailer(call.ResponseAsync, call.GetTrailers), + call.ResponseHeadersAsync, call.GetStatus, call.GetTrailers, call.Dispose); + } + + + private async Task HandleTrailer(Task inner, Func getTrailer) + where TResponse : class + { + var result = await inner; + var metadata = getTrailer(); + var header = metadata.GetValueBytes(DprMessageHeader.GprcMetadataKeyName); + Debug.Assert(header != null); + if (session.Receive(header)) return result; + throw new RpcException(Status.DefaultCancelled); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprServerInterceptor.cs b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprServerInterceptor.cs new file mode 100644 index 000000000..ed9a566dc --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprServerInterceptor.cs @@ -0,0 +1,57 @@ +using System.Threading.Tasks; +using FASTER.common; +using Grpc.Core; +using Grpc.Core.Interceptors; +using Status = Grpc.Core.Status; + +namespace FASTER.libdpr.gRPC +{ + public class DprServerInterceptor : Interceptor + { + private StateObject _stateObject; + private ThreadLocalObjectPool serializationArrayPool; + private int requestId; + + // For now, we require that the gRPC integration only works with RwLatchVersionScheme, which supports protected + // blocks that start and end on different threads + // TService is a parameter for DI to only create interceptors after the service is up + 
public DprServerInterceptor(StateObject stateObject, TService service) + { + this._stateObject = stateObject; + serializationArrayPool = new ThreadLocalObjectPool(() => new byte[1 << 10]); + } + + public override async Task UnaryServerHandler(TRequest request, + ServerCallContext context, + UnaryServerMethod continuation) + { + // TODO(Tianyu): Create Epoch Context specific to a request + + var header = context.RequestHeaders.GetValueBytes(DprMessageHeader.GprcMetadataKeyName); + if (header != null) + { + // Speculative code path + if (!await _stateObject.TryReceiveAndStartActionAsync(header)) + // Use an error to signal to caller that this call cannot proceed + // TODO(Tianyu): add more descriptive exception information + throw new RpcException(Status.DefaultCancelled); + var response = await continuation.Invoke(request, context); + var buf = serializationArrayPool.Checkout(); + _stateObject.ProduceTagAndEndAction(buf); + context.ResponseTrailers.Add(DprMessageHeader.GprcMetadataKeyName, buf); + serializationArrayPool.Return(buf); + return response; + } + else + { + // Non speculative code path + _stateObject.StartLocalAction(); + var response = await continuation.Invoke(request, context); + _stateObject.EndAction(); + // TODO(Tianyu): Allow custom version headers to avoid waiting on, say, a read into a committed value + await _stateObject.NextCommit(); + return response; + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprStatelessServerInterceptor.cs b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprStatelessServerInterceptor.cs new file mode 100644 index 000000000..8ce31e420 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/gRPC/DprStatelessServerInterceptor.cs @@ -0,0 +1,22 @@ +using System.Threading.Tasks; +using Grpc.Core; +using Grpc.Core.Interceptors; + +namespace FASTER.libdpr.gRPC +{ + public class DprStatelessServerInterceptor : Interceptor + { + public override async Task UnaryServerHandler(TRequest 
request, ServerCallContext context, + UnaryServerMethod continuation) + { + var header = context.RequestHeaders.GetValueBytes(DprMessageHeader.GprcMetadataKeyName); + var response = await continuation.Invoke(request, context); + + if (header != null) + // Simply reflect the dependency information back + context.ResponseTrailers.Add(DprMessageHeader.GprcMetadataKeyName, header); + + return response; + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/DprFinderBase.cs b/cs/research/libdpr/src/FASTER.libdpr/management/DprFinderBase.cs new file mode 100644 index 000000000..9a294cb2e --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/DprFinderBase.cs @@ -0,0 +1,122 @@ +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; +using FASTER.libdpr.proto; + +namespace FASTER.libdpr +{ + public abstract class DprFinderBase : IDprFinder + { + // We maintain two cuts that alternate being updated, and atomically swap them + private Dictionary frontCut, backCut; + private ClusterState frontState, backState; + + protected DprFinderBase() + { + frontCut = new Dictionary(); + backCut = new Dictionary(); + frontState = new ClusterState(); + backState = new ClusterState(); + } + + public long SafeVersion(DprWorkerId dprWorkerId) + { + return frontCut.TryGetValue(dprWorkerId, out var result) ? 
result : 0; + } + + public long SystemWorldLine() + { + return frontState.currentWorldLine; + } + + public DprStatus CheckStatus(ReadOnlySpan header) + { + ref readonly var dprHeader = ref MemoryMarshal.AsRef(header); + var state = frontState; + + if (dprHeader.WorldLine < state.currentWorldLine) return DprStatus.ROLLEDBACK; + if (dprHeader.SrcWorkerId == DprWorkerId.INVALID) + { + // This is a client dependency that uses the varlen dependency fields, so we need to check that those + // are all committed instead + unsafe + { + fixed (byte* d = dprHeader.data) + { + var depsHead = d + dprHeader.ClientDepsOffset; + for (var i = 0; i < dprHeader.NumClientDeps; i++) + { + ref var wv = ref Unsafe.AsRef(depsHead); + if (SafeVersion(dprHeader.SrcWorkerId) < dprHeader.Version) + return DprStatus.SPECULATIVE; + depsHead += sizeof(WorkerVersion); + } + + return DprStatus.COMMITTED; + } + } + + } + // Otherwise just check the originating worker + return SafeVersion(dprHeader.SrcWorkerId) >= dprHeader.Version ? 
DprStatus.COMMITTED : DprStatus.SPECULATIVE; + } + + public abstract void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, + IEnumerable deps); + + protected abstract bool Sync(ClusterState stateToUpdate, Dictionary cutToUpdate); + + protected abstract void SendGraphReconstruction(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider); + + protected abstract void AddWorkerInternal(DprWorkerId id); + + public abstract void RemoveWorker(DprWorkerId id); + + public void Refresh(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + // Reset data structures + backCut.Clear(); + backState.currentWorldLine = 1; + backState.worldLinePrefix.Clear(); + + if (!Sync(backState, backCut)) + { + SendGraphReconstruction(id, provider); + Refresh(id, provider); + } + + // Ok to not update the two atomically because cuts are resilient to cluster state changes anyway + backState = Interlocked.Exchange(ref frontState, backState); + backCut = Interlocked.Exchange(ref frontCut, backCut); + } + + public void RefreshStateless() + { + backCut.Clear(); + backState.currentWorldLine = 1; + backState.worldLinePrefix.Clear(); + // Cut is unavailable, do nothing. 
+ if (!Sync(backState, backCut)) return; + + // Ok to not update the two atomically because cuts are resilient to cluster state changes anyway + backState = Interlocked.Exchange(ref frontState, backState); + backCut = Interlocked.Exchange(ref frontCut, backCut); + } + + public long AddWorker(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + RefreshStateless(); + // A blind resending of graph is necessary, in case the coordinator is undergoing recovery and pausing + // processing of new workers until every worker has responded + SendGraphReconstruction(id, provider); + + AddWorkerInternal(id); + + // Get cluster state afterwards to see if recovery is necessary + Refresh(id, provider); + return SafeVersion(id); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/GraphDprFinderBackend.cs b/cs/research/libdpr/src/FASTER.libdpr/management/GraphDprFinderBackend.cs new file mode 100644 index 000000000..c6d46a60b --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/GraphDprFinderBackend.cs @@ -0,0 +1,462 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Threading; +using FASTER.common; + +namespace FASTER.libdpr +{ + public abstract class PrecomputedSyncResponseBase + { + public ReaderWriterLockSlim rwLatch = new ReaderWriterLockSlim(); + + public abstract void ResetClusterState(ClusterState clusterState); + + public abstract void UpdateCut(Dictionary newCut); + } + + /// + /// Persistent state about the DPR cluster + /// + public class ClusterState + { + /// + /// Latest recorded world-line of the cluster + /// + public long currentWorldLine; + + /// + /// common prefix of the current world-line with previous ones. In other words, this is the cut workers had + /// to recover to before entering the current world-line. 
This also serves as point of truth of cluster + /// membership, and a worker is recognized as part of the cluster iff they have an entry in this dictionary. + /// Workers that did not participate in the previous world-lines have 0 in the cut. + /// + public Dictionary worldLinePrefix; + + /// + /// Creates a new ClusterState object for an empty cluster + /// + public ClusterState() + { + currentWorldLine = 1; + worldLinePrefix = new Dictionary(); + } + + /// + /// Creates a ClusterState from serialized bytes + /// + /// byte buffer that holds serialized cluster state + /// offset to start scanning + /// end of the serialized ClusterState on the buffer + public int PopulateFromBuffer(byte[] buf, int offset) + { + worldLinePrefix.Clear(); + currentWorldLine = BitConverter.ToInt64(buf, offset); + return RespUtil.ReadDictionaryFromBytes(buf, offset + sizeof(long), worldLinePrefix); + } + + public byte[] SerializeToBytes() + { + // Reserve space for world-line + prefix as a minimum + var result = new byte[sizeof(long) + RespUtil.DictionarySerializedSize(worldLinePrefix)]; + BitConverter.TryWriteBytes(new Span(result, 0, sizeof(long)), currentWorldLine); + RespUtil.SerializeDictionary(worldLinePrefix, result, sizeof(long)); + return result; + } + } + + /// + /// Backend logic for the RespGraphDprFinderServer. + /// The implementation relies on state objects to persist dependencies and avoids incurring additional storage + /// round-trips on the commit critical path. 
+ /// + public class GraphDprFinderBackend + { + // Used to send add/delete worker requests to processing thread + private readonly ConcurrentQueue<(DprWorkerId, Action<(long, long)>)> addQueue = + new ConcurrentQueue<(DprWorkerId, Action<(long, long)>)>(); + + private readonly ConcurrentQueue<(DprWorkerId, Action)> deleteQueue = + new ConcurrentQueue<(DprWorkerId, Action)>(); + + private readonly ConcurrentDictionary> precedenceGraph = + new ConcurrentDictionary>(); + + private readonly SimpleObjectPool> objectPool = + new SimpleObjectPool>(() => new List()); + + private readonly PingPongDevice persistentStorage; + + private readonly ReaderWriterLockSlim clusterChangeLatch = new ReaderWriterLockSlim(); + private readonly Dictionary currentCut = new Dictionary(); + private readonly ClusterState volatileClusterState; + + private bool cutChanged; + private readonly Queue frontier = new Queue(); + private readonly ConcurrentQueue outstandingWvs = new ConcurrentQueue(); + private readonly HashSet visited = new HashSet(); + + // Only used during DprFinder recovery + private readonly RecoveryState recoveryState; + + + private List precomputedResponses; + + + /// + /// Create a new EnhancedDprFinderBackend backed by the given storage. If the storage holds a valid persisted + /// EnhancedDprFinderBackend state, the constructor will attempt to recover from it. + /// + /// persistent storage backing this dpr finder + // TODO(Tianyu): Change to explicitly depend on a log instead of blob storage? 
+ public GraphDprFinderBackend(PingPongDevice persistentStorage) + { + this.persistentStorage = persistentStorage; + // see if a previously persisted state is available + var buf = persistentStorage.ReadLatestCompleteWrite(); + volatileClusterState = new ClusterState(); + if (buf != null) + volatileClusterState.PopulateFromBuffer(buf, 0); + + foreach (var worker in volatileClusterState.worldLinePrefix.Keys) + currentCut.Add(worker, 0); + recoveryState = new RecoveryState(this); + precomputedResponses = new List(); + } + + // Note: Only safe to call when starting the backend before processing starts. + public void AddResponseObjectToPrecompute(PrecomputedSyncResponseBase obj) + { + obj.ResetClusterState(volatileClusterState); + precomputedResponses.Add(obj); + } + + + // Try to commit a single worker version by chasing through its dependencies + // The worker version supplied in the argument must already be reported as persistent + private bool TryCommitWorkerVersion(WorkerVersion wv) + { + // Because wv is already persistent, if it's not in the graph that means it was pruned as part of a commit. + // Ok to return as committed, but no need to mark the cut as changed + if (!precedenceGraph.ContainsKey(wv)) return true; + + + // If version is in the graph but somehow already committed, remove it and reclaim associated resources + if (wv.Version <= currentCut.GetValueOrDefault(wv.DprWorkerId, 0)) + { + // already committed. 
Remove but do not signal changes to the cut + if (precedenceGraph.TryRemove(wv, out var list)) + objectPool.Return(list); + return true; + } + + // Prepare traversal data structures + visited.Clear(); + frontier.Clear(); + frontier.Enqueue(wv); + + // Breadth first search to find all dependencies + while (frontier.Count != 0) + { + var node = frontier.Dequeue(); + if (visited.Contains(node)) continue; + // If node is committed as determined by the cut, ok to continue + if (currentCut.GetValueOrDefault(node.DprWorkerId, 0) >= node.Version) continue; + // Otherwise, need to check if it is persistent (and therefore present in the graph) + if (!precedenceGraph.TryGetValue(node, out var val)) return false; + + visited.Add(node); + foreach (var dep in val) + { + // No need to add self-dependencies + if (dep.DprWorkerId != node.DprWorkerId) + frontier.Enqueue(dep); + } + } + + // If all dependencies are present, we should commit them all + // This will appear atomic without special protection as we serialize out the cut for sync calls in + // the same thread later on. Other calls reading the cut involve cluster changes and cannot + // interleave with this code + foreach (var committed in visited) + { + // Mark cut as changed so we know to serialize the new cut later on + cutChanged = true; + var version = currentCut.GetValueOrDefault(committed.DprWorkerId, 0); + // Update cut if necessary + if (version < committed.Version) + currentCut[committed.DprWorkerId] = committed.Version; + if (precedenceGraph.TryRemove(committed, out var list)) + objectPool.Return(list); + } + + return true; + } + + /// + /// Performs work to evolve cluster state and find DPR cuts. Must be called repeatedly for the DprFinder to + /// make progress. Process() should only be invoked sequentially, but may be concurrent with other public methods. 
+ /// + public void Process() + { + // Unable to make commit progress until we rebuild precedence graph from worker's persistent storage + if (!recoveryState.RecoveryComplete()) return; + + // Process any cluster change requests + if (!addQueue.IsEmpty || !deleteQueue.IsEmpty) + { + // Because this code-path is rare, ok to allocate new data structures here + var callbacks = new List(); + + // Need to grab an exclusive lock to ensure that if a rollback is triggered, there are no concurrent + // NewCheckpoint calls that can pollute the graph with rolled back versions + clusterChangeLatch.EnterWriteLock(); + + // Process cluster change requests + while (addQueue.TryDequeue(out var entry)) + { + var result = ProcessAddWorker(entry.Item1); + callbacks.Add(() => entry.Item2?.Invoke(result)); + } + + while (deleteQueue.TryDequeue(out var entry)) + { + ProcessDeleteWorker(entry.Item1); + callbacks.Add(() => entry.Item2()); + } + + // serialize new cluster state and persist + var newState = volatileClusterState.SerializeToBytes(); + persistentStorage.WriteReliably(newState, 0, newState.Length); + foreach (var response in precomputedResponses) + { + response.ResetClusterState(volatileClusterState); + response.UpdateCut(currentCut); + } + + clusterChangeLatch.ExitWriteLock(); + + foreach (var callback in callbacks) + callback(); + } + + // Traverse the graph to try and commit versions + TryCommitVersions(); + } + + private void TryCommitVersions(bool tryCommitAll = false) + { + // Perform graph traversal in mutual exclusion with cluster change, but it is ok for traversal to be + // concurrent with adding of new versions to the graph. + clusterChangeLatch.EnterReadLock(); + + // Go through the unprocessed wvs and traverse the graph, unless instructed otherwise, give up after a while + // to return control to the calling thread + var threshold = tryCommitAll ? 
outstandingWvs.Count : 100; + for (var i = 0; i < threshold; i++) + { + if (!outstandingWvs.TryDequeue(out var wv)) break; + if (!TryCommitWorkerVersion(wv)) + outstandingWvs.Enqueue(wv); + } + + if (cutChanged) + { + // Compute a new syncResponse for consumption + // No need to protect against concurrent changes to the cluster because this method is either called + // on the process thread or during recovery. No cluster change can interleave. + // TODO(Tianyu): Maybe call latches here instead of inside UpdateCut method + foreach (var response in precomputedResponses) + { + response.UpdateCut(currentCut); + } + + cutChanged = false; + } + + clusterChangeLatch.ExitReadLock(); + } + + /// + /// Adds a new checkpoint to the precedence graph with the given dependencies + /// + /// world-line of the checkpoint + /// worker version checkpointed + /// dependencies of the checkpoint + public void NewCheckpoint(long worldLine, WorkerVersion wv, IEnumerable deps) + { + // The DprFinder should be the most up-to-date w.r.t. world-lines and we should not ever receive + // a request from the future. + Debug.Assert(worldLine <= volatileClusterState.currentWorldLine); + Debug.Assert(currentCut.ContainsKey(wv.DprWorkerId)); + try + { + // Cannot interleave NewCheckpoint calls with cluster changes --- cluster changes may change the + // current world-line and remove worker-versions. A concurrent NewCheckpoint may not see that + // and enter a worker-version that should have been removed after cluster change has finished. 
+ clusterChangeLatch.EnterReadLock(); + // Unless the reported versions are in the current world-line (or belong to the common prefix), we should + // not allow this write to go through + if (worldLine != volatileClusterState.currentWorldLine + && wv.Version > volatileClusterState.worldLinePrefix[wv.DprWorkerId]) return; + + // This may be a duplicate + if (currentCut[wv.DprWorkerId] >= wv.Version) return; + + var list = objectPool.Checkout(); + list.Clear(); + list.AddRange(deps); + if (!precedenceGraph.TryAdd(wv, list)) + objectPool.Return(list); + else + outstandingWvs.Enqueue(wv); + } + finally + { + clusterChangeLatch.ExitReadLock(); + } + } + + public void MarkWorkerAccountedFor(DprWorkerId dprWorkerId) + { + // Should only be invoked if recovery is underway. However, a new worker may send a blind graph resend. + if (recoveryState.RecoveryComplete()) return; + // Lock here because can be accessed from multiple threads. No need to lock once all workers are accounted + // for as then only the graph traversal thread will update current cut + lock (currentCut) + { + Debug.Assert(currentCut.ContainsKey(dprWorkerId)); + } + + recoveryState.MarkWorkerAccountedFor(dprWorkerId); + } + + /// + /// Add the worker to the cluster. If the worker is already part of the cluster the DprFinder considers that + /// a failure recovery and triggers necessary next steps. Given callback is invoked when the effect of this + /// call is recoverable on storage + /// + /// worker to add to the cluster + /// callback to invoke when worker addition is persistent + public void AddWorker(DprWorkerId dprWorkerId, Action<(long, long)> callback = null) + { + addQueue.Enqueue(ValueTuple.Create(dprWorkerId, callback)); + } + + private (long, long) ProcessAddWorker(DprWorkerId dprWorkerId) + { + // Before adding a worker, make sure all workers have already reported all (if any) locally outstanding + // checkpoints. We require this to be able to process the request. 
+ if (!recoveryState.RecoveryComplete()) throw new InvalidOperationException(); + (long, long) result; + if (volatileClusterState.worldLinePrefix.TryAdd(dprWorkerId, 0)) + { + // First time we have seen this worker --- start them at current world-line + result = (volatileClusterState.currentWorldLine, 0); + currentCut.Add(dprWorkerId, 0); + cutChanged = true; + } + else + { + // Otherwise, this worker thinks it's booting up for a second time, which means there was a restart. + // We count this as a failure. Advance the cluster world-line + volatileClusterState.currentWorldLine++; + // TODO(Tianyu): This is slightly more aggressive than needed, but no worse than original DPR. Can + // implement more precise rollback later. + foreach (var entry in currentCut) + volatileClusterState.worldLinePrefix[entry.Key] = entry.Value; + // Anything in the precedence graph is rolled back and we can just remove them + foreach (var list in precedenceGraph.Values) + objectPool.Return(list); + precedenceGraph.Clear(); + outstandingWvs.Clear(); + var survivingVersion = currentCut[dprWorkerId]; + result = (volatileClusterState.currentWorldLine, survivingVersion); + } + + return result; + } + + /// + /// Delete a worker from the cluster. It is up to the caller to ensure that the worker is not participating in + /// any future operations or have outstanding unpersisted operations others may depend on. Until callback is + /// invoked, the worker is still considered part of the system and must be recovered if it crashes. + /// + /// the worker to delete from the cluster + /// callback to invoke when worker removal is persistent on storage + public void DeleteWorker(DprWorkerId dprWorkerId, Action callback = null) + { + deleteQueue.Enqueue(ValueTuple.Create(dprWorkerId, callback)); + } + + private void ProcessDeleteWorker(DprWorkerId dprWorkerId) + { + // Before adding a worker, make sure all workers have already reported all (if any) locally outstanding + // checkpoints. 
We require this to be able to process the request. + if (!recoveryState.RecoveryComplete()) throw new InvalidOperationException(); + + currentCut.Remove(dprWorkerId); + cutChanged = true; + volatileClusterState.worldLinePrefix.Remove(dprWorkerId); + volatileClusterState.worldLinePrefix.Remove(dprWorkerId); + } + + // Recovery state is the information the backend needs to keep when restarting (presumably because + // of failure) from previous on-disk state. + private class RecoveryState + { + private readonly GraphDprFinderBackend backend; + private readonly CountdownEvent countdown; + private bool recoveryComplete; + + private readonly ConcurrentDictionary workersUnaccontedFor = + new ConcurrentDictionary(); + + internal RecoveryState(GraphDprFinderBackend backend) + { + this.backend = backend; + // Check if the cluster is empty + if (backend.volatileClusterState.worldLinePrefix.Count == 0) + { + // If so, we do not need to recover anything, simply mark + // this backend as fully recovered for future operations + recoveryComplete = true; + return; + } + + // Otherwise, we need to first rebuild an in-memory precedence graph from information persisted + // at each state object. 
+ recoveryComplete = false; + // Mark all previously known worker as unaccounted for --- we cannot make any statements about the + // current state of the cluster until we are sure we have up-to-date information from all of them + foreach (var w in backend.volatileClusterState.worldLinePrefix.Keys) + workersUnaccontedFor.TryAdd(w, 0); + countdown = new CountdownEvent(workersUnaccontedFor.Count); + } + + internal bool RecoveryComplete() + { + return recoveryComplete; + } + + // Called when the backend has received all precedence graph information from a worker + internal void MarkWorkerAccountedFor(DprWorkerId dprWorkerId) + { + // A worker may repeatedly check-in due to crashes or other reason, we need to make sure each + // worker decrements the count exactly once + if (!workersUnaccontedFor.TryRemove(dprWorkerId, out _)) return; + + if (!countdown.Signal()) return; + // At this point, we have all information that is at least as up-to-date as when we crashed. We can + // traverse the graph and be sure to reach a conclusion that's at least as up-to-date as the guarantees + // we may have given out before we crashed. + backend.cutChanged = true; + backend.TryCommitVersions(true); + + // Only mark recovery complete after we have reached that conclusion + recoveryComplete = true; + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/IDprFinder.cs b/cs/research/libdpr/src/FASTER.libdpr/management/IDprFinder.cs new file mode 100644 index 000000000..58aa4e2c7 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/IDprFinder.cs @@ -0,0 +1,69 @@ +using System; +using System.Collections.Generic; + +namespace FASTER.libdpr +{ + public enum DprStatus + { + COMMITTED, SPECULATIVE, ROLLEDBACK + } + + /// + /// A DprFinder is the interface on each Worker/Client to report local checkpoint/recovery and receive guarantees/ + /// rollback requests. 
This may implement a distributed algorithm underneath or be backed by some other backend + /// component. + /// + public interface IDprFinder + { + public delegate IEnumerable> UnprunedVersionsProvider(); + /// + /// For a given version, returns the largest version number that is recoverable. Method may return arbitrary + /// number for a worker that is not part of the cluster. This should be equivalent to calling + /// ReadSnapshot().SafeVersion(worker) for some point in time. + /// + /// + /// The largest version number that is recoverable for the given version (may be arbitrary if worker is + /// not part of the cluster) + /// + long SafeVersion(DprWorkerId dprWorkerId); + + /// + /// Returns the current system world-line. + /// + /// the current system world-line + long SystemWorldLine(); + + /// + /// Report a version as locally persistent with the given dependencies. + /// It suffices for the dependencies to contain only the largest version number for each worker (e.g. if a + /// version depends on (w1, 10) and (w1, 11), it suffices to only include (w1, 11), and need not contain + /// self-dependencies to other versions of the local worker.) + /// + /// + /// + void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, IEnumerable deps); + + /// + /// Refreshes the local view of the system. This method must be called periodically to receive up-to-date + /// information about the rest of the cluster. + /// + void Refresh(DprWorkerId id, UnprunedVersionsProvider provider); + + void RefreshStateless(); + + /// + /// Registers the given worker and state object combination with the cluster. Worker id must be unique within + /// the cluster. Must be invoked before performing any operation on the state object. One DprFinder object + /// should only register one worker. 
+ /// + /// id of the worker + /// the version state object should recover to before beginning execution, or 0 if no recovery is required + long AddWorker(DprWorkerId id, UnprunedVersionsProvider provider); + + /// + /// Removes the registered worker from the cluster. It is up to caller to ensure that the deleted worker is not + /// currently accepting operations and no other worker has outstanding dependencies on the deleted worker. + /// + void RemoveWorker(DprWorkerId id); + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/LocalStubDprFinder.cs b/cs/research/libdpr/src/FASTER.libdpr/management/LocalStubDprFinder.cs new file mode 100644 index 000000000..996c926f7 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/LocalStubDprFinder.cs @@ -0,0 +1,41 @@ + +using System.Collections.Generic; + +namespace FASTER.libdpr; + +public class LocalStubDprFinder : IDprFinder +{ + private long persistedVersion; + + public long SafeVersion(DprWorkerId dprWorkerId) + { + return persistedVersion; + } + + public long SystemWorldLine() + { + return 1; + } + + public void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, IEnumerable deps) + { + persistedVersion = persisted.Version; + } + + public void Refresh(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + } + + public void RefreshStateless() + { + } + + public long AddWorker(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + return 0; + } + + public void RemoveWorker(DprWorkerId id) + { + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/PingPongDevice.cs b/cs/research/libdpr/src/FASTER.libdpr/management/PingPongDevice.cs new file mode 100644 index 000000000..7d3620ac4 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/PingPongDevice.cs @@ -0,0 +1,173 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using 
System.Runtime.InteropServices; +using System.Security.Cryptography; +using System.Threading; +using FASTER.core; + +namespace FASTER.libdpr +{ + /// + /// A PingPongDevice allows users to reliably and atomically update a value on storage (represented by arbitrary byte + /// arrays) by using two storage devices and alternating between them. If the machine experience a failure before a + /// write is complete, subsequent reads will return the value of most recent completed write. + /// + public class PingPongDevice : IDisposable + { + private readonly MD5 checksumHasher = MD5.Create(); + + private IDevice frontDevice, backDevice; + private long versionCounter; + private bool disposeChild; + + /// + /// Creates a new PingPongDevice from the given two devices. + /// + /// first device + /// second device + public PingPongDevice(IDevice device1, IDevice device2, bool disposeChild = false) + { + Debug.Assert(checksumHasher.HashSize == 128); + this.disposeChild = disposeChild; + var v1 = ReadFromDevice(device1, out _); + var v2 = ReadFromDevice(device2, out _); + if (v1 == -1 && v2 == -1) + { + // No prior writes available, start from scratch + versionCounter = 0; + frontDevice = device1; + backDevice = device2; + } + else if (v1 > v2) + { + versionCounter = v1; + frontDevice = device2; + backDevice = device1; + } + else if (v2 > v1) + { + versionCounter = v2; + frontDevice = device1; + backDevice = device2; + } + else + { + throw new FasterException("The ping-pong device detects corrupted data from the given devices"); + } + } + + /// + /// Dispsoe + /// + public void Dispose() + { + checksumHasher?.Dispose(); + if (disposeChild) + { + backDevice.Dispose(); + frontDevice.Dispose(); + } + } + + /// + /// Reliably writes a byte array to the device to be retrieved later. 
The value is only retrievable if written + /// completely; otherwise, the old value is preserved + /// + /// + /// + /// + public unsafe void WriteReliably(byte[] buf, int offset, int size) + { + var header = new MetadataHeader(); + header.size = size; + header.version = ++versionCounter; + var hash = checksumHasher.ComputeHash(buf, offset, size); + + fixed (byte* b = &hash[0]) + { + Unsafe.CopyBlock(header.checksum, b, 16); + } + + var countdown = new CountdownEvent(2); + + // Write of metadata block should be atomic + Debug.Assert(frontDevice.SegmentSize == -1 || frontDevice.SegmentSize >= sizeof(MetadataHeader)); + frontDevice.WriteAsync((IntPtr) header.bytes, 0, 0, (uint) sizeof(MetadataHeader), + (e, n, o) => { countdown.Signal(); }, null); + + var handle = GCHandle.Alloc(buf, GCHandleType.Pinned); + // Skip one segment to avoid clobbering with metadata header write + frontDevice.WriteAsync(handle.AddrOfPinnedObject(), 0, frontDevice.SectorSize, (uint) size, + (e, n, o) => + { + countdown.Signal(); + handle.Free(); + }, null); + + countdown.Wait(); + (frontDevice, backDevice) = (backDevice, frontDevice); + countdown.Dispose(); + } + + private unsafe long ReadFromDevice(IDevice device, out byte[] buf) + { + buf = null; + var header = new MetadataHeader(); + var completed = new ManualResetEventSlim(); + try + { + device.ReadAsync(0, 0, (IntPtr)header.bytes, (uint)sizeof(MetadataHeader), + (e, n, o) => completed.Set(), null); + completed.Wait(); + if (header.size == 0) return -1; + + buf = new byte[header.size]; + } + catch (Exception) + { + return -1; + } + + completed = new ManualResetEventSlim(); + fixed (byte* b = &buf[0]) + { + device.ReadAsync(0, device.SectorSize, (IntPtr) b, (uint) header.size, + (e, n, o) => completed.Set(), null); + completed.Wait(); + } + + // Compare the hash with checksum + var contentHash = checksumHasher.ComputeHash(buf); + for (var i = 0; i < contentHash.Length; i++) + if (header.checksum[i] != contentHash[i]) + // Not a complete 
write, should discard + return -1; + return header.version; + } + + /// + /// Read the last completed write on the device + /// + /// read bytes, or null if the device has never been written to + public byte[] ReadLatestCompleteWrite() + { + var vfront = ReadFromDevice(frontDevice, out var bufFront); + var vback = ReadFromDevice(backDevice, out var bufBack); + if (vback == -1 && vfront == -1) + // No available writes to read back in + return null; + + return vfront > vback ? bufFront : bufBack; + } + + [StructLayout(LayoutKind.Explicit, Size = 32)] + private unsafe struct MetadataHeader + { + [FieldOffset(0)] internal fixed byte bytes[32]; + [FieldOffset(0)] internal long size; + [FieldOffset(8)] internal long version; + [FieldOffset(16)] internal fixed byte checksum[16]; + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/DprFinderGrpcService.cs b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/DprFinderGrpcService.cs new file mode 100644 index 000000000..4163cd641 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/DprFinderGrpcService.cs @@ -0,0 +1,160 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using FASTER.libdpr.proto; +using Grpc.Core; +using Microsoft.Extensions.Hosting; + +namespace FASTER.libdpr +{ + public class GrpcPrecomputedSyncResponse : PrecomputedSyncResponseBase + { + internal SyncResponse obj = new SyncResponse(); + public override void ResetClusterState(ClusterState clusterState) + { + lock (this) + { + var newResponse = new SyncResponse(obj); + newResponse.WorldLine = clusterState.currentWorldLine; + newResponse.WorldLinePrefix.Clear(); + foreach (var entry in clusterState.worldLinePrefix) + newResponse.WorldLinePrefix.Add(new proto.WorkerVersion + { + Id = entry.Key.guid, + Version = entry.Value + }); + obj = newResponse; + } + } + + public override void UpdateCut(Dictionary 
newCut) + { + lock (this) + { + var newResponse = new SyncResponse(obj); + + newResponse.CurrentCut.Clear(); + foreach (var entry in newCut) + newResponse.CurrentCut.Add(new proto.WorkerVersion + { + Id = entry.Key.guid, + Version = entry.Value + }); + obj = newResponse; + } + } + } + + public class DprFinderGrpcBackgroundService : BackgroundService + { + private readonly GraphDprFinderBackend backend; + private GrpcPrecomputedSyncResponse response; + private Thread processingThread; + + public DprFinderGrpcBackgroundService(GraphDprFinderBackend backend) + { + this.backend = backend; + response = new GrpcPrecomputedSyncResponse(); + backend.AddResponseObjectToPrecompute(response); + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + processingThread = new Thread(() => + { + while (!stoppingToken.IsCancellationRequested) + { + backend.Process(); + Thread.Yield(); + } + }); + processingThread.Start(); + await Task.Delay(Timeout.Infinite, stoppingToken); + processingThread.Join(); + } + + public Task AddWorker(AddWorkerRequest request) + { + var result = new TaskCompletionSource(); + backend.AddWorker(new DprWorkerId(request.Id), + r => result.SetResult(new AddWorkerResponse + { Id = request.Id, WorldLine = r.Item1, RecoveredVersion = r.Item2 })); + return result.Task; + } + + public Task RemoveWorker(RemoveWorkerRequest request) + { + var result = new TaskCompletionSource(); + backend.DeleteWorker(new DprWorkerId(request.Id), + () => result.SetResult(new RemoveWorkerResponse { Ok = true })); + return result.Task; + } + + public Task NewCheckpoint(NewCheckpointRequest request) + { + backend.NewCheckpoint(request.WorldLine, new WorkerVersion(request.Id, request.Version), + request.Deps.Select(wv => new WorkerVersion(wv.Id, wv.Version))); + return Task.FromResult(new NewCheckpointResponse + { + Ok = true + }); + } + + public Task Sync() + { + return Task.FromResult(response.obj); + } + + public Task ResendGraph(ResendGraphRequest 
request) + { + foreach (var n in request.GraphNodes) + { + backend.NewCheckpoint(n.WorldLine, new WorkerVersion(n.Id, n.Version), + n.Deps.Select(wv => new WorkerVersion(wv.Id, wv.Version))); + } + backend.MarkWorkerAccountedFor(new DprWorkerId(request.Id)); + return Task.FromResult(new ResendGraphResponse + { + Ok = true + }); + } + } + + public class DprFinderGrpcService : DprFinder.DprFinderBase + { + private DprFinderGrpcBackgroundService backend; + + public DprFinderGrpcService(DprFinderGrpcBackgroundService backend) + { + this.backend = backend; + } + + public override Task AddWorker(AddWorkerRequest request, ServerCallContext context) + { + return backend.AddWorker(request); + } + + public override Task RemoveWorker(RemoveWorkerRequest request, ServerCallContext context) + { + return backend.RemoveWorker(request); + } + + public override Task NewCheckpoint(NewCheckpointRequest request, + ServerCallContext context) + { + return backend.NewCheckpoint(request); + } + + public override Task Sync(SyncRequest request, ServerCallContext context) + { + return backend.Sync(); + } + + public override Task ResendGraph(ResendGraphRequest request, ServerCallContext context) + { + return backend.ResendGraph(request); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/GrpcDprFinder.cs b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/GrpcDprFinder.cs new file mode 100644 index 000000000..36feec7ae --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/GrpcDprFinder.cs @@ -0,0 +1,95 @@ +using System.Collections.Generic; +using FASTER.libdpr.proto; +using Grpc.Net.Client; + +namespace FASTER.libdpr +{ + public class GrpcDprFinder : DprFinderBase + { + private DprFinder.DprFinderClient finderClient; + + public GrpcDprFinder(GrpcChannel channel) + { + finderClient = new DprFinder.DprFinderClient(channel); + } + + public override void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, 
+ IEnumerable deps) + { + var request = new NewCheckpointRequest + { + Id = persisted.DprWorkerId.guid, + Version = persisted.Version, + WorldLine = worldLine + }; + foreach (var dep in deps) + request.Deps.Add(new proto.WorkerVersion + { + Id = dep.DprWorkerId.guid, + Version = dep.Version + }); + + // Can just leave async without waiting to complete + finderClient.NewCheckpointAsync(request); + } + + protected override bool Sync(ClusterState stateToUpdate, Dictionary cutToUpdate) + { + var response = finderClient.Sync(new SyncRequest()); + if (response.CurrentCut.Count == 0) return false; + + stateToUpdate.currentWorldLine = response.WorldLine; + foreach (var entry in response.WorldLinePrefix) + stateToUpdate.worldLinePrefix.Add(new DprWorkerId(entry.Id), entry.Version); + foreach (var entry in response.CurrentCut) + cutToUpdate.Add(new DprWorkerId(entry.Id), entry.Version); + return true; + } + + protected override void SendGraphReconstruction(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + var checkpoints = provider(); + var request = new ResendGraphRequest + { + Id = id.guid + }; + foreach (var m in checkpoints) + { + SerializationUtil.DeserializeCheckpointMetadata(m.Span, + out var worldLine, out var wv, out var deps); + var checkpointRequest = new NewCheckpointRequest + { + Id = id.guid, + Version = wv.Version, + WorldLine = worldLine + }; + foreach (var dep in deps) + checkpointRequest.Deps.Add(new proto.WorkerVersion + { + Id = dep.DprWorkerId.guid, + Version = dep.Version + }); + request.GraphNodes.Add(checkpointRequest); + finderClient.NewCheckpoint(checkpointRequest); + } + + finderClient.ResendGraph(request); + } + + protected override void AddWorkerInternal(DprWorkerId id) + { + finderClient.AddWorker(new AddWorkerRequest + { + Id = id.guid + }); + } + + public override void RemoveWorker(DprWorkerId id) + { + finderClient.RemoveWorker(new RemoveWorkerRequest + { + Id = id.guid + }); + } + } +} \ No newline at end of file diff --git 
a/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/finder.proto b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/finder.proto new file mode 100644 index 000000000..a3cf23b1e --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/gRPC/finder.proto @@ -0,0 +1,70 @@ +syntax = "proto3"; + +option csharp_namespace = "FASTER.libdpr.proto"; + +message WorkerVersion { + int64 id = 1; + int64 version = 2; +} + +message AddWorkerRequest { + int64 id = 1; +} + +message AddWorkerResponse { + int64 id = 1; + int64 worldLine = 2; + int64 recoveredVersion = 3; +} + +message RemoveWorkerRequest { + int64 id = 1; +} + +message RemoveWorkerResponse { + bool ok = 1; +} + +message NewCheckpointRequest { + int64 id = 1; + int64 version = 2; + int64 worldLine = 3; + repeated WorkerVersion deps = 4; +} + +message NewCheckpointResponse { + bool ok = 1; +} + +message SyncRequest { + int64 id = 1; +} + +message SyncResponse { + int64 worldLine = 1; + repeated WorkerVersion worldLinePrefix = 2; + repeated WorkerVersion currentCut = 3; +} + +message ResendGraphRequest { + int64 id = 1; + repeated NewCheckpointRequest graphNodes = 2; +} + +message ResendGraphResponse { + bool ok = 1; +} + + +// TODO(Tianyu): Investigate whether streaming variants will make these more efficient +service DprFinder { + rpc AddWorker(AddWorkerRequest) returns (AddWorkerResponse); + + rpc RemoveWorker(RemoveWorkerRequest) returns (RemoveWorkerResponse); + + rpc NewCheckpoint(NewCheckpointRequest) returns (NewCheckpointResponse); + + rpc Sync(SyncRequest) returns (SyncResponse); + + rpc ResendGraph(ResendGraphRequest) returns (ResendGraphResponse); +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/resp/MessageUtil.cs b/cs/research/libdpr/src/FASTER.libdpr/management/resp/MessageUtil.cs new file mode 100644 index 000000000..d20a663f1 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/resp/MessageUtil.cs @@ -0,0 +1,254 @@ +using System; 
+using System.Collections.Generic; +using System.Diagnostics; +using System.Net.Sockets; +using System.Text; +using FASTER.common; +using FASTER.core; + +namespace FASTER.libdpr +{ + internal class DprFinderSocketReaderWriter : IDisposable + { + private static readonly byte[] OkResponse = Encoding.GetEncoding("ASCII").GetBytes("+OK\r\n"); + + // TODO(Tianyu): size information? + private readonly ThreadLocalObjectPool reusableMessageBuffers = + new ThreadLocalObjectPool(() => new byte[1 << 15]); + + private Socket socket; + + internal DprFinderSocketReaderWriter(Socket socket) + { + this.socket = socket; + } + + public void Dispose() + { + socket.Dispose(); + reusableMessageBuffers.Dispose(); + } + + internal void WaitForAcks(int numAcks) + { + // Wait for all of the sent commands to be acked + var received = 0; + var buf = reusableMessageBuffers.Checkout(); + while (received < numAcks * 5) + received += socket.Receive(buf); + reusableMessageBuffers.Return(buf); + } + + internal int ReceiveInto(byte[] buffer) + { + return socket.Receive(buffer); + } + + internal void SendOk() + { + socket.Send(OkResponse); + } + + internal int SendGraphReconstruction(DprWorkerId dprWorkerId, IDprFinder.UnprunedVersionsProvider provider) + { + var buf = reusableMessageBuffers.Checkout(); + var head = 0; + var checkpoints = provider(); + var minVersion = long.MaxValue; + var numRequests = 0; + foreach (var m in checkpoints) + { + SerializationUtil.DeserializeCheckpointMetadata(m.Span, + out var worldLine, out var wv, out var deps); + head += RespUtil.WriteRedisArrayHeader(4, buf, head); + head += RespUtil.WriteRedisBulkString("NewCheckpoint", buf, head); + head += RespUtil.WriteRedisBulkString(worldLine, buf, head); + head += RespUtil.WriteRedisBulkString(wv, buf, head); + head += RespUtil.WriteRedisBulkString(deps, buf, head); + if (minVersion > wv.Version) minVersion = wv.Version; + numRequests++; + } + + if (numRequests == 0) return 0; + head += RespUtil.WriteRedisArrayHeader(2, 
buf, head); + head += RespUtil.WriteRedisBulkString("GraphResent", buf, head); + var committedVersion = new WorkerVersion(dprWorkerId, minVersion == long.MaxValue ? 0 : minVersion); + head += RespUtil.WriteRedisBulkString(committedVersion, buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + return 1; + } + + internal void SendAddWorkerCommand(DprWorkerId dprWorkerId) + { + var buf = reusableMessageBuffers.Checkout(); + var head = RespUtil.WriteRedisArrayHeader(2, buf, 0); + head += RespUtil.WriteRedisBulkString("AddWorker", buf, head); + head += RespUtil.WriteRedisBulkString(dprWorkerId.guid, buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendDeleteWorkerCommand(DprWorkerId dprWorkerId) + { + var buf = reusableMessageBuffers.Checkout(); + var head = RespUtil.WriteRedisArrayHeader(2, buf, 0); + head += RespUtil.WriteRedisBulkString("DeleteWorker", buf, head); + head += RespUtil.WriteRedisBulkString(dprWorkerId.guid, buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendNewCheckpointCommand(long worldLine, WorkerVersion checkpointed, + IEnumerable deps) + { + var buf = reusableMessageBuffers.Checkout(); + var head = RespUtil.WriteRedisArrayHeader(4, buf, 0); + head += RespUtil.WriteRedisBulkString("NewCheckpoint", buf, head); + head += RespUtil.WriteRedisBulkString(worldLine, buf, head); + head += RespUtil.WriteRedisBulkString(checkpointed, buf, head); + head += RespUtil.WriteRedisBulkString(deps, buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendReportRecoveryCommand(WorkerVersion recovered, long worldLine) + { + var buf = reusableMessageBuffers.Checkout(); + var head = RespUtil.WriteRedisArrayHeader(3, buf, 0); + head += RespUtil.WriteRedisBulkString("ReportRecovery", buf, head); + head += RespUtil.WriteRedisBulkString(recovered, buf, head); + 
head += RespUtil.WriteRedisBulkString(worldLine, buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendSyncCommand() + { + var buf = reusableMessageBuffers.Checkout(); + var head = RespUtil.WriteRedisArrayHeader(1, buf, 0); + head += RespUtil.WriteRedisBulkString("Sync", buf, head); + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendSyncResponse(Span serializedBuffer) + { + var buf = reusableMessageBuffers.Checkout(); + var head = 0; + buf[head++] = (byte)'$'; + + var size = RespUtil.LongToDecimalString(serializedBuffer.Length, buf, head); + Debug.Assert(size != 0); + head += size; + + Debug.Assert(head + 4 + serializedBuffer.Length < buf.Length); + buf[head++] = (byte)'\r'; + buf[head++] = (byte)'\n'; + + head += sizeof(long); + serializedBuffer.CopyTo(new Span(buf, head, buf.Length - head)); + head += serializedBuffer.Length; + + buf[head++] = (byte)'\r'; + buf[head++] = (byte)'\n'; + + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + + internal void SendAddWorkerResponse((long, long) result) + { + var buf = reusableMessageBuffers.Checkout(); + var head = 0; + buf[head++] = (byte)'$'; + + var size = RespUtil.LongToDecimalString(2 * sizeof(long), buf, head); + Debug.Assert(size != 0); + head += size; + + Debug.Assert(head + 4 + 2 * sizeof(long) < buf.Length); + buf[head++] = (byte)'\r'; + buf[head++] = (byte)'\n'; + + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), result.Item1); + head += sizeof(long); + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), result.Item2); + head += sizeof(long); + + buf[head++] = (byte)'\r'; + buf[head++] = (byte)'\n'; + + socket.Send(buf, 0, head, SocketFlags.None); + reusableMessageBuffers.Return(buf); + } + } + + internal class DprFinderRedisProtocolConnState + { + private readonly Action commandHandler; + private readonly 
DprFinderCommandParser parser = new DprFinderCommandParser(); + private int readHead, bytesRead, commandStart; + private readonly DprFinderSocketReaderWriter readerWriter; + private readonly Socket socket; + + internal DprFinderRedisProtocolConnState(Socket socket, Action commandHandler) + { + this.socket = socket; + readerWriter = new DprFinderSocketReaderWriter(socket); + this.commandHandler = commandHandler; + } + + private static bool HandleReceiveCompletion(SocketAsyncEventArgs e) + { + var connState = (DprFinderRedisProtocolConnState)e.UserToken; + if (e.BytesTransferred == 0 || e.SocketError != SocketError.Success) + { + connState.socket.Dispose(); + e.Dispose(); + return false; + } + + connState.bytesRead += e.BytesTransferred; + for (; connState.readHead < connState.bytesRead; connState.readHead++) + if (connState.parser.ProcessChar(connState.readHead, e.Buffer)) + { + connState.commandHandler(connState.parser.currentCommand, connState.readerWriter); + connState.commandStart = connState.readHead + 1; + } + + // TODO(Tianyu): Magic number + // If less than some certain number of bytes left in the buffer, shift buffer content to head to free + // up some space. Don't want to do this too often. Obviously ok to do if no bytes need to be copied ( + // the current end of buffer marks the end of a command, and we can discard the entire buffer). 
+ if (e.Buffer.Length - connState.readHead < 4096 || connState.readHead == connState.commandStart) + { + var bytesLeft = connState.bytesRead - connState.commandStart; + // Shift buffer to front + Array.Copy(e.Buffer, connState.commandStart, e.Buffer, 0, bytesLeft); + connState.bytesRead = bytesLeft; + connState.readHead -= connState.commandStart; + connState.commandStart = 0; + } + + e.SetBuffer(connState.readHead, e.Buffer.Length - connState.readHead); + return true; + } + + internal static void RecvEventArg_Completed(object sender, SocketAsyncEventArgs e) + { + var connState = (DprFinderRedisProtocolConnState)e.UserToken; + try + { + do + { + // No more things to receive + if (!HandleReceiveCompletion(e)) return; + } while (!connState.socket.ReceiveAsync(e)); + } + catch (ObjectDisposedException) + { + // Probably caused by a normal cancellation from this side. Ok to ignore + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/resp/RedisParsing.cs b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RedisParsing.cs new file mode 100644 index 000000000..31ae867a1 --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RedisParsing.cs @@ -0,0 +1,277 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Text; + +namespace FASTER.libdpr +{ + internal class DprFinderResponseParser + { + internal int size = -1; + internal int stringStart = -1; + + public bool ProcessChar(int readHead, byte[] buf) + { + if (readHead == 0) + { + Debug.Assert((char) buf[readHead] == '$'); + size = -1; + } + + switch ((char) buf[readHead]) + { + case '\n': + if (buf[readHead - 1] != '\r') return false; + if (size == -1) + { + // Implicit message start at 0 always + size = (int) RespUtil.LongFromDecimalString(buf, 1, readHead - 1); + stringStart = readHead + 1; + return false; + } + + return readHead == stringStart + size + 1; + default: + // Nothing to do + return false; + } + } + } 
+ + internal struct DprFinderCommand + { + internal enum Type + { + NEW_CHECKPOINT, + REPORT_RECOVERY, + ADD_WORKER, + DELETE_WORKER, + SYNC, + GRAPH_RESENT + } + + internal Type commandType; + internal WorkerVersion wv; + internal DprWorkerId w; + internal long worldLine; + internal List deps; + } + + internal enum CommandParserState + { + NONE, + NUM_ARGS, + COMMAND_TYPE, + ARG_W, + ARG_WV, + ARG_WL, + ARG_DEPS + } + + internal class DprFinderCommandParser + { + internal CommandParserState commandParserState; + internal DprFinderCommand currentCommand; + internal int currentCommandStart = -1, currentFragmentStart, size, stringStart; + + internal DprFinderCommandParser() + { + currentCommand.deps = new List(); + } + + private void ProcessCommandStart(int readHead, byte[] buf) + { + currentCommandStart = readHead; + // Initialize to an invalid + size = -1; + switch ((char) buf[readHead]) + { + case '*': + commandParserState = CommandParserState.NUM_ARGS; + currentFragmentStart = readHead; + break; + default: + throw new NotImplementedException("Unsupported RESP syntax --- we only" + + "support DPR commands sent as BULK_STRING"); + } + } + + private bool ProcessRedisInt(int readHead, byte[] buf, out long result) + { + result = default; + if (buf[readHead - 1] != '\r' || buf[readHead] != '\n') return false; + result = RespUtil.LongFromDecimalString(buf, currentFragmentStart + 1, readHead - 1); + // Fragment has ended + currentFragmentStart = readHead + 1; + return true; + } + + private bool ProcessRedisBulkString(int readHead, byte[] buf) + { + // account for \r\n in the end of string field + if (size != -1 && readHead == stringStart + size + 1) + + { + // Fragment has ended + currentFragmentStart = readHead + 1; + return true; + } + + if (size == -1 && buf[readHead] == '\n' && buf[readHead - 1] == '\r') + { + // This is the first field, should read the size. 
The integer size field starts one past + // the message type byte and ends at '\r' + size = (int) RespUtil.LongFromDecimalString(buf, currentFragmentStart + 1, readHead - 1); + + if (size == -1) throw new NotImplementedException("Null Bulk String not supported"); + + stringStart = readHead + 1; + } + + return false; + } + + internal unsafe bool ProcessChar(int readHead, byte[] buf) + { + switch (commandParserState) + { + case CommandParserState.NONE: + ProcessCommandStart(readHead, buf); + return false; + case CommandParserState.NUM_ARGS: + { + if (ProcessRedisInt(readHead, buf, out var size)) + commandParserState = CommandParserState.COMMAND_TYPE; + return false; + } + case CommandParserState.COMMAND_TYPE: + if (ProcessRedisBulkString(readHead, buf)) + { + switch ((char) buf[stringStart]) + { + case 'N': + Debug.Assert(Encoding.ASCII.GetString(buf, stringStart, size) + .Equals("NewCheckpoint")); + currentCommand.commandType = DprFinderCommand.Type.NEW_CHECKPOINT; + commandParserState = CommandParserState.ARG_WL; + break; + case 'R': + Debug.Assert(Encoding.ASCII.GetString(buf, stringStart, size) + .Equals("ReportRecovery")); + currentCommand.commandType = DprFinderCommand.Type.REPORT_RECOVERY; + commandParserState = CommandParserState.ARG_WV; + break; + case 'A': + Debug.Assert(Encoding.ASCII.GetString(buf, stringStart, size) + .Equals("AddWorker")); + currentCommand.commandType = DprFinderCommand.Type.ADD_WORKER; + commandParserState = CommandParserState.ARG_W; + break; + case 'D': + Debug.Assert(Encoding.ASCII.GetString(buf, stringStart, size) + .Equals("DeleteWorker")); + currentCommand.commandType = DprFinderCommand.Type.DELETE_WORKER; + commandParserState = CommandParserState.ARG_W; + break; + case 'S': + Debug.Assert(Encoding.ASCII.GetString(buf, stringStart, size).Equals("Sync")); + currentCommand.commandType = DprFinderCommand.Type.SYNC; + commandParserState = CommandParserState.NONE; + return true; + case 'G': + Debug.Assert(Encoding.ASCII.GetString(buf, 
stringStart, size) + .Equals("GraphResent")); + currentCommand.commandType = DprFinderCommand.Type.GRAPH_RESENT; + commandParserState = CommandParserState.ARG_WV; + break; + default: + throw new NotImplementedException("Unrecognized command type"); + } + + size = -1; + } + + return false; + case CommandParserState.ARG_W: + if (ProcessRedisBulkString(readHead, buf)) + { + var workerId = BitConverter.ToInt64(buf, stringStart); + currentCommand.w = new DprWorkerId(workerId); + commandParserState = CommandParserState.NONE; + size = -1; + return true; + } + + return false; + case CommandParserState.ARG_WV: + if (ProcessRedisBulkString(readHead, buf)) + { + Debug.Assert(size == sizeof(WorkerVersion)); + var workerId = BitConverter.ToInt64(buf, stringStart); + var version = BitConverter.ToInt64(buf, stringStart + sizeof(long)); + currentCommand.wv = new WorkerVersion(workerId, version); + if (currentCommand.commandType == DprFinderCommand.Type.NEW_CHECKPOINT) + commandParserState = CommandParserState.ARG_DEPS; + else if (currentCommand.commandType == DprFinderCommand.Type.REPORT_RECOVERY) + commandParserState = CommandParserState.ARG_WL; + else if (currentCommand.commandType == DprFinderCommand.Type.GRAPH_RESENT) + { + commandParserState = CommandParserState.NONE; + return true; + } + else + Debug.Assert(false); + + size = -1; + } + + return false; + case CommandParserState.ARG_WL: + if (ProcessRedisBulkString(readHead, buf)) + { + Debug.Assert(size == sizeof(long)); + currentCommand.worldLine = BitConverter.ToInt64(buf, stringStart); + size = -1; + if (currentCommand.commandType == DprFinderCommand.Type.NEW_CHECKPOINT) + { + commandParserState = CommandParserState.ARG_WV; + } + else if (currentCommand.commandType == DprFinderCommand.Type.REPORT_RECOVERY) + { + commandParserState = CommandParserState.NONE; + return true; + } + else + { + Debug.Assert(false); + } + } + + return false; + case CommandParserState.ARG_DEPS: + if (ProcessRedisBulkString(readHead, buf)) + { + 
currentCommand.deps.Clear(); + var numDeps = BitConverter.ToInt32(buf, stringStart); + for (var i = 0; i < numDeps; i++) + { + // TODO(Tianyu): Replace with WV version + var workerId = BitConverter.ToInt32(buf, + stringStart + sizeof(int) + i * sizeof(WorkerVersion)); + var version = BitConverter.ToInt32(buf, + stringStart + 2 * sizeof(int) + i * sizeof(WorkerVersion)); + currentCommand.deps.Add(new WorkerVersion(workerId, version)); + size = -1; + } + + commandParserState = CommandParserState.NONE; + return true; + } + + return false; + default: + throw new NotImplementedException("Unrecognized Parser state"); + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinder.cs b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinder.cs new file mode 100644 index 000000000..5d73d018d --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinder.cs @@ -0,0 +1,112 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Net.Sockets; + +namespace FASTER.libdpr +{ + /// + /// DPR Finder client stub that connects to a backend RespGraphDprFinderServer + /// + public sealed class RespGraphDprFinder : DprFinderBase, IDisposable + { + private readonly DprFinderSocketReaderWriter socket; + private readonly DprFinderResponseParser parser = new DprFinderResponseParser(); + private readonly byte[] recvBuffer = new byte[1 << 20]; + + /// + /// Create a new DprFinder using the supplied socket + /// + /// socket to use + /// + public RespGraphDprFinder(Socket dprFinderConn) + { + dprFinderConn.NoDelay = true; + socket = new DprFinderSocketReaderWriter(dprFinderConn); + } + + /// + /// Create a new DprFinder by connecting to the given endpoint + /// + /// IP address of the desired endpoint + /// port of the desired endpoint + public RespGraphDprFinder(string ip, int port) + { + var ipEndpoint = new IPEndPoint(IPAddress.Parse(ip), port); + var 
conn = new Socket(ipEndpoint.AddressFamily, SocketType.Stream, ProtocolType.Tcp); + conn.NoDelay = true; + conn.Connect(ipEndpoint); + socket = new DprFinderSocketReaderWriter(conn); + } + + /// + /// Dispose + /// + public void Dispose() + { + socket.Dispose(); + } + + /// + public override void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, IEnumerable deps) + { + lock (socket) + socket.SendNewCheckpointCommand(worldLine, persisted, deps); + } + + protected override bool Sync(ClusterState stateToUpdate, Dictionary cutToUpdate) + { + lock (socket) + { + socket.SendSyncCommand(); + ProcessRespResponse(); + + var head = stateToUpdate.PopulateFromBuffer(recvBuffer, parser.stringStart + sizeof(long)); + if (BitConverter.ToInt32(recvBuffer, head) == -1) return false; + + RespUtil.ReadDictionaryFromBytes(recvBuffer, head, cutToUpdate); + return true; + } + } + + protected override void SendGraphReconstruction(DprWorkerId id, IDprFinder.UnprunedVersionsProvider provider) + { + lock (socket) + { + var acks = socket.SendGraphReconstruction(id, provider); + socket.WaitForAcks(acks); + } + } + + protected override void AddWorkerInternal(DprWorkerId id) + { + lock (socket) + { + socket.SendAddWorkerCommand(id); + ProcessRespResponse(); + } + } + + /// + public override void RemoveWorker(DprWorkerId id) + { + lock (socket) + { + socket.SendDeleteWorkerCommand(id); + socket.WaitForAcks(1); + } + } + + private void ProcessRespResponse() + { + int i = 0, receivedSize = 0; + while (true) + { + receivedSize += socket.ReceiveInto(recvBuffer); + for (; i < receivedSize; i++) + if (parser.ProcessChar(i, recvBuffer)) + return; + } + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinderServer.cs b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinderServer.cs new file mode 100644 index 000000000..b0a6b5862 --- /dev/null +++ 
b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespGraphDprFinderServer.cs @@ -0,0 +1,196 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Net.Sockets; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.libdpr +{ + + /// + /// Precomputed response to sync calls into DprFinder. Holds both serialized cluster persistent state and the + /// current DPR cut. + /// + public class RespPrecomputedSyncResponse : PrecomputedSyncResponseBase + { + private int recoveryStateEnd; + private int responseEnd; + private byte[] serializedResponse = new byte[1 << 15]; + + public Span GetResponseBytes() => new Span(serializedResponse, 0, responseEnd); + + public override void ResetClusterState(ClusterState clusterState) + { + rwLatch.EnterWriteLock(); + // Reserve space for world-line + prefix + size field of cut as a minimum + var serializedSize = sizeof(long) + RespUtil.DictionarySerializedSize(clusterState.worldLinePrefix) + + sizeof(int); + // Resize response buffer to fit + if (serializedSize > serializedResponse.Length) + serializedResponse = new byte[Math.Max(2 * serializedResponse.Length, serializedSize)]; + + BitConverter.TryWriteBytes(new Span(serializedResponse, 0, sizeof(long)), + clusterState.currentWorldLine); + recoveryStateEnd = + RespUtil.SerializeDictionary(clusterState.worldLinePrefix, serializedResponse, sizeof(long)); + // In the absence of a cut, set cut to a special "unknown" value. 
+ BitConverter.TryWriteBytes(new Span(serializedResponse, recoveryStateEnd, sizeof(int)), -1); + responseEnd = recoveryStateEnd + sizeof(int); + rwLatch.ExitWriteLock(); + } + + /// + /// Update the PrecomputedSyncResponse to hold the given cut + /// + /// DPR cut to serialize + public override void UpdateCut(Dictionary newCut) + { + // Update serialized under write latch so readers cannot see partial updates + rwLatch.EnterWriteLock(); + var serializedSize = RespUtil.DictionarySerializedSize(newCut); + + // Resize response buffer to fit + if (serializedSize > serializedResponse.Length - recoveryStateEnd) + { + var newBuffer = new byte[Math.Max(2 * serializedResponse.Length, recoveryStateEnd + serializedSize)]; + Array.Copy(serializedResponse, newBuffer, recoveryStateEnd); + serializedResponse = newBuffer; + } + + responseEnd = RespUtil.SerializeDictionary(newCut, serializedResponse, recoveryStateEnd); + rwLatch.ExitWriteLock(); + } + } + + /// + /// A simple single-server DprFinder implementation relying primarily on graph traversal. + /// Fault-tolerant provided that the runtime environment can restart the server on the same storage volume + /// and IP address in bounded time (fail-restart model). 
+ /// The server speaks the Redis protocol and appears as a Redis server that supports the following commands:\ + /// AddWorker(worker) -> OK + /// RemoveWorker(worker) -> OK + /// NewCheckpoint(wv, deps) -> OK + /// Sync() -> state + /// All parameters and return values are Redis bulk strings of bytes that encode the corresponding C# + /// object with the exception of return values of '+OK\r\n's + /// + public class RespGraphDprFinderServer : IDisposable + { + private readonly GraphDprFinderBackend backend; + private readonly string ip; + private readonly int port; + private Thread processThread; + private Socket servSocket; + private ManualResetEventSlim termination; + private RespPrecomputedSyncResponse precomputedResponse; + + /// + /// Constructs a new RespGraphDrpFinderServer instance at the given ip, listening on the given port, + /// and using the given backend object + /// + /// ip address of server + /// port to listen on the server + /// backend of the server + public RespGraphDprFinderServer(string ip, int port, GraphDprFinderBackend backend) + { + this.ip = ip; + this.port = port; + this.backend = backend; + precomputedResponse = new RespPrecomputedSyncResponse(); + backend.AddResponseObjectToPrecompute(precomputedResponse); + } + + /// + public void Dispose() + { + servSocket.Dispose(); + // TODO(Tianyu): Clean shutdown of client connections + termination.Set(); + processThread.Join(); + } + + /// + /// Main server loop for DPR finding + /// + public void StartServer() + { + termination = new ManualResetEventSlim(); + + processThread = new Thread(() => + { + while (!termination.IsSet) + backend.Process(); + }); + processThread.Start(); + + var ipAddr = IPAddress.Parse(ip); + var endPoint = new IPEndPoint(ipAddr, port); + servSocket = new Socket(ipAddr.AddressFamily, SocketType.Stream, ProtocolType.Tcp); + servSocket.Bind(endPoint); + servSocket.Listen(512); + servSocket.NoDelay = true; + + var acceptEventArg = new SocketAsyncEventArgs(); + 
acceptEventArg.Completed += AcceptEventArg_Completed; + if (!servSocket.AcceptAsync(acceptEventArg)) + AcceptEventArg_Completed(null, acceptEventArg); + } + + private bool HandleNewClientConnection(SocketAsyncEventArgs e) + { + if (e.SocketError != SocketError.Success) + { + e.Dispose(); + return false; + } + + e.AcceptSocket.NoDelay = true; + // Set up listening events + var saea = new SocketAsyncEventArgs(); + saea.SetBuffer(new byte[1 << 15], 0, 1 << 15); + saea.UserToken = new DprFinderRedisProtocolConnState(e.AcceptSocket, HandleClientCommand); + saea.Completed += DprFinderRedisProtocolConnState.RecvEventArg_Completed; + // If the client already have packets, avoid handling it here on the handler thread so we don't block future accepts. + if (!e.AcceptSocket.ReceiveAsync(saea)) + Task.Run(() => DprFinderRedisProtocolConnState.RecvEventArg_Completed(null, saea)); + return true; + } + + private void AcceptEventArg_Completed(object sender, SocketAsyncEventArgs e) + { + do + { + if (!HandleNewClientConnection(e)) break; + e.AcceptSocket = null; + } while (!servSocket.AcceptAsync(e)); + } + + private void HandleClientCommand(DprFinderCommand command, DprFinderSocketReaderWriter socket) + { + switch (command.commandType) + { + case DprFinderCommand.Type.NEW_CHECKPOINT: + backend.NewCheckpoint(command.worldLine, command.wv, command.deps); + break; + case DprFinderCommand.Type.GRAPH_RESENT: + backend.MarkWorkerAccountedFor(command.wv.DprWorkerId); + socket.SendOk(); + break; + case DprFinderCommand.Type.SYNC: + precomputedResponse.rwLatch.EnterReadLock(); + socket.SendSyncResponse(precomputedResponse.GetResponseBytes()); + precomputedResponse.rwLatch.ExitReadLock(); + break; + case DprFinderCommand.Type.ADD_WORKER: + backend.AddWorker(command.w, socket.SendAddWorkerResponse); + break; + case DprFinderCommand.Type.DELETE_WORKER: + backend.DeleteWorker(command.w, socket.SendOk); + break; + default: + throw new ArgumentOutOfRangeException(); + } + } + } +} \ No newline 
at end of file diff --git a/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespUtil.cs b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespUtil.cs new file mode 100644 index 000000000..2d556260f --- /dev/null +++ b/cs/research/libdpr/src/FASTER.libdpr/management/resp/RespUtil.cs @@ -0,0 +1,204 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using FASTER.core; + +namespace FASTER.libdpr +{ + public static class RespUtil + { + internal static unsafe int LongToDecimalString(long a, byte[] buf, int offset) + { + var digits = stackalloc byte[20]; + var numDigits = 0; + do + { + digits[numDigits] = (byte) (a % 10 + '0'); + numDigits++; + a /= 10; + } while (a > 0); + + var head = offset; + + if (head + numDigits >= buf.Length) return 0; + for (var i = numDigits - 1; i >= 0; i--) + buf[head++] = digits[i]; + return head - offset; + } + + internal static long LongFromDecimalString(byte[] buf, int start, int end) + { + var negative = false; + if (buf[start] == '-') + { + negative = true; + start++; + } + + long result = 0; + + for (var i = start; i < end; i++) + { + result *= 10; + result += buf[i] - '0'; + } + + return negative ? 
-result : result; + } + + internal static int WriteRedisBulkString(string val, byte[] buf, int offset) + { + var head = offset; + if (head + 1 >= buf.Length) return 0; + buf[head++] = (byte) '$'; + + var size = LongToDecimalString(val.Length, buf, head); + if (size == 0) return 0; + head += size; + + if (head + 4 + val.Length >= buf.Length) return 0; + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + + foreach (var t in val) + buf[head++] = (byte) t; + + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + return head - offset; + } + + internal static int WriteRedisBulkString(long val, byte[] buf, int offset) + { + var head = offset; + if (head + 1 >= buf.Length) return 0; + buf[head++] = (byte) '$'; + + var size = LongToDecimalString(sizeof(long), buf, head); + if (size == 0) return 0; + head += size; + + if (head + 4 + sizeof(long) >= buf.Length) return 0; + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), val); + head += sizeof(long); + + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + return head - offset; + } + + internal static unsafe int WriteRedisBulkString(WorkerVersion val, byte[] buf, int offset) + { + var head = offset; + if (head + sizeof(byte) >= buf.Length) return 0; + buf[head++] = (byte) '$'; + + var size = LongToDecimalString(sizeof(WorkerVersion), buf, head); + if (size == 0) return 0; + head += size; + + if (head + 4 + sizeof(WorkerVersion) >= buf.Length) return 0; + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), val.DprWorkerId.guid); + head += sizeof(long); + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), val.Version); + head += sizeof(long); + + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + return head - offset; + } + + internal static unsafe int WriteRedisBulkString(IEnumerable val, byte[] buf, int offset) + { + var head = offset; + if (head + 
sizeof(byte) >= buf.Length) return 0; + buf[head++] = (byte) '$'; + + // Find size of encoding up front + var count = val.Count(); + var totalSize = sizeof(int) + count * sizeof(WorkerVersion); + + var size = LongToDecimalString(totalSize, buf, head); + if (size == 0) return 0; + head += size; + + if (head + 4 + totalSize >= buf.Length) return 0; + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(int)), count); + head += sizeof(int); + foreach (var wv in val) + { + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), wv.DprWorkerId.guid); + head += sizeof(long); + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), wv.Version); + head += sizeof(long); + } + + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + return head - offset; + } + + internal static int WriteRedisArrayHeader(int numElems, byte[] buf, int offset) + { + var head = offset; + if (head + 1 >= buf.Length) return 0; + buf[head++] = (byte) '*'; + + var size = LongToDecimalString(numElems, buf, head); + if (size == 0) return 0; + head += size; + + if (head + 2 >= buf.Length) return 0; + buf[head++] = (byte) '\r'; + buf[head++] = (byte) '\n'; + return head - offset; + } + + internal static int DictionarySerializedSize(IDictionary dict) + { + return sizeof(int) + dict.Count * 2 * sizeof(long); + } + + internal static int SerializeDictionary(IDictionary dict, byte[] buf, int head) + { + if (head + DictionarySerializedSize(dict) > buf.Length) return 0; + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(int)), dict.Count); + head += sizeof(int); + foreach (var entry in dict) + { + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), entry.Key.guid); + head += sizeof(long); + BitConverter.TryWriteBytes(new Span(buf, head, sizeof(long)), entry.Value); + head += sizeof(long); + } + + return head; + } + + public static int ReadDictionaryFromBytes(byte[] buf, int head, IDictionary result) + { + var size = 
BitConverter.ToInt32(buf, head); + head += sizeof(int); + for (var i = 0; i < size; i++) + { + var workerId = BitConverter.ToInt64(buf, head); + head += sizeof(long); + var val = BitConverter.ToInt64(buf, head); + head += sizeof(long); + if (result != null) + result[new DprWorkerId(workerId)] = val; + } + + return head; + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/FASTER.libdpr.test.csproj b/cs/research/libdpr/test/FASTER.libdpr.test/FASTER.libdpr.test.csproj new file mode 100644 index 000000000..9954319ae --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/FASTER.libdpr.test.csproj @@ -0,0 +1,22 @@ + + + + net7.0 + + false + + true + + + + + + + + + + + + + + diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/SimpleTests.cs b/cs/research/libdpr/test/FASTER.libdpr.test/SimpleTests.cs new file mode 100644 index 000000000..95df644f7 --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/SimpleTests.cs @@ -0,0 +1,198 @@ +using System.Threading; +using FASTER.core; +using NUnit.Framework; + +namespace FASTER.libdpr +{ + [TestFixture] + public class SimpleTests + { + private ManualResetEventSlim terminationToken; + private SimulatedDprFinderService simulatedFinderService = new(); + + [SetUp] + public void SetUp() + { + terminationToken = new ManualResetEventSlim(); + // Process add request in the background so we do not block the current thread + simulatedFinderService.ProcessInBackground(terminationToken); + } + + [TearDown] + public void TearDown() + { + terminationToken.Set(); + } + + private DprWorker ConstructWorker(long id, bool autoCompleteCheckpoints = true) + { + return new DprWorker( + new TestStateObject(new WorkerId(id), autoCompleteCheckpoints), + new EpochProtectedVersionScheme(new LightEpoch()), + new DprWorkerOptions + { + Me = new WorkerId(id), + DprFinder = new TestDprFinder(simulatedFinderService), + }); + } + + private void SendMessage(DprWorker from, + DprWorker to, + bool 
expected) + { + from.StartStep(); + var m = from.StateObject().GenerateMessageToSend(); + from.EndStepAndProduceTag(m.dprHeader); + var status = to.StartStepWithReceive(m.dprHeader); + Assert.AreEqual(expected, status); + if (status) + to.StateObject().Receive(m); + to.EndStep(); + } + + private void VerifyCommit(params (DprWorker, long)[] expected) + { + // Wait a bit for the DprFinder service to catch up + simulatedFinderService.NextBackgroundProcessComplete().GetAwaiter().GetResult(); + foreach (var (worker, version) in expected) + { + worker.ForceRefresh(); + Assert.AreEqual(version, worker.CommittedVersion()); + } + } + + [Test] + public void TestOneMessage() + { + var tested0 = ConstructWorker(0); + var tested1 = ConstructWorker(1); + + tested0.ConnectToCluster(); + tested1.ConnectToCluster(); + Assert.AreEqual(1, tested0.Version()); + Assert.AreEqual(1, tested1.Version()); + Assert.AreEqual(1, tested0.WorldLine()); + Assert.AreEqual(1, tested1.WorldLine()); + + SendMessage(tested0, tested1, true); + + tested1.ForceCheckpoint(); + Assert.AreEqual(2, tested1.Version()); + Assert.AreEqual(1, tested1.WorldLine()); + // Dependencies have not committed, so nothing should commit + VerifyCommit((tested1, 0)); + + tested0.ForceCheckpoint(); + Assert.AreEqual(2, tested0.Version()); + Assert.AreEqual(1, tested0.WorldLine()); + + VerifyCommit((tested0, 1), (tested1, 1)); + } + + [Test] + public void TestThreeServers() + { + var a = ConstructWorker(0, false); + a.ConnectToCluster(); + var b = ConstructWorker(1, false); + b.ConnectToCluster(); + var c = ConstructWorker(2, false); + c.ConnectToCluster(); + + // Construct a dependency graph without commiting anything + SendMessage(a, b, true); + a.ForceCheckpoint(); + b.ForceCheckpoint(); + c.ForceCheckpoint(); + SendMessage(b, a, true); + SendMessage(c, b, true); + SendMessage(a, c, true); + a.ForceCheckpoint(); + b.ForceCheckpoint(); + c.ForceCheckpoint(); + + // Nothing should commit + VerifyCommit((a, 0), (b, 0), (c, 
0)); + + c.StateObject().CompleteCheckpoint(1); + // C should commit, but nothing else + VerifyCommit((a, 0), (b, 0), (c, 1)); + + b.StateObject().CompleteCheckpoint(1); + // B still has outstanding dependencies and therefore nothing would commit + VerifyCommit((a, 0), (b, 0), (c, 1)); + + a.StateObject().CompleteCheckpoint(1); + // Commits can now happen + VerifyCommit((a, 1), (b, 1), (c, 1)); + + b.StateObject().CompleteCheckpoint(2); + a.StateObject().CompleteCheckpoint(2); + // Nothing should commit because C still hasn't committed + VerifyCommit((a, 1), (b, 1), (c, 1)); + + c.StateObject().CompleteCheckpoint(2); + // Now everything should commit + VerifyCommit((a, 2), (b, 2), (c, 2)); + } + + [Test] + public void TestSimpleRecovery() + { + var a = ConstructWorker(0, false); + a.ConnectToCluster(); + var b = ConstructWorker(1, false); + b.ConnectToCluster(); + var c = ConstructWorker(2, false); + c.ConnectToCluster(); + + // Construct a dependency graph without commiting anything + SendMessage(a, b, true); + a.ForceCheckpoint(); + var a1State = a.StateObject().stateSerialNum; + b.ForceCheckpoint(); + var b1State = b.StateObject().stateSerialNum; + c.ForceCheckpoint(); + var c1State = c.StateObject().stateSerialNum; + + SendMessage(b, a, true); + SendMessage(c, b, true); + SendMessage(a, c, true); + a.ForceCheckpoint(); + var a2State = a.StateObject().stateSerialNum; + Assert.AreNotEqual(a1State, a2State); + b.ForceCheckpoint(); + var b2State = b.StateObject().stateSerialNum; + Assert.AreNotEqual(b1State, b2State); + c.ForceCheckpoint(); + var c2State = c.StateObject().stateSerialNum; + Assert.AreNotEqual(c1State, c2State); + + // Nothing should commit + VerifyCommit((a, 0), (b, 0), (c, 0)); + + c.StateObject().CompleteCheckpoint(1); + b.StateObject().CompleteCheckpoint(1); + a.StateObject().CompleteCheckpoint(1); + VerifyCommit((a, 1), (b, 1), (c, 1)); + + b.StateObject().CompleteCheckpoint(2); + a.StateObject().CompleteCheckpoint(2); + VerifyCommit((a, 1), (b, 
1), (c, 1)); + + // Simulate a failure by reconnecting + c.ConnectToCluster(); + simulatedFinderService.NextBackgroundProcessComplete().GetAwaiter().GetResult(); + a.ForceRefresh(); + b.ForceRefresh(); + c.ForceRefresh(); + // Everyone should be back at the commit prefix with a larger worldline + Assert.AreEqual(a1State, a.StateObject().stateSerialNum); + Assert.AreEqual(2, a.WorldLine()); + Assert.AreEqual(b1State, b.StateObject().stateSerialNum); + Assert.AreEqual(2, b.WorldLine()); + Assert.AreEqual(c1State, c.StateObject().stateSerialNum); + Assert.AreEqual(2, c.WorldLine()); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/TestDprFinder.cs b/cs/research/libdpr/test/FASTER.libdpr.test/TestDprFinder.cs new file mode 100644 index 000000000..37d8b56a1 --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/TestDprFinder.cs @@ -0,0 +1,142 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FASTER.core; + +namespace FASTER.libdpr; + +public class SimulatedDprFinderService : IDisposable +{ + private IDevice frontDevice, backDevice; + + // Randomly reset to simulate DprFinder failure + private volatile GraphDprFinderBackend backend; + private TestPrecomputedResponse response; + private TaskCompletionSource nextProcess = new(); + + + public SimulatedDprFinderService() + { + frontDevice = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + backDevice = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + response = new TestPrecomputedResponse(); + backend = new GraphDprFinderBackend(new PingPongDevice(frontDevice, backDevice)); + backend.AddResponseObjectToPrecompute(response); + } + + public void Dispose() + { + frontDevice.Dispose(); + backDevice.Dispose(); + } + + public TestPrecomputedResponse GetResponseObject() => response; + + public GraphDprFinderBackend GetDprFinderBackend() + { + GraphDprFinderBackend reference = null; + do + { + reference = backend; + } 
while (reference == null); + + return reference; + } + + public void FailOver(int delayMilli) + { + backend = null; + Thread.Sleep(delayMilli); + var newResponse = new TestPrecomputedResponse(); + var newBackend = new GraphDprFinderBackend(new PingPongDevice(frontDevice, backDevice)); + newBackend.AddResponseObjectToPrecompute(newResponse); + backend = newBackend; + response = newResponse; + } + + public void ProcessOnce() + { + backend?.Process(); + } + + public Task NextBackgroundProcessComplete() => nextProcess.Task; + + public void ProcessInBackground(ManualResetEventSlim termination) + { + Task.Run(() => + { + while (!termination.IsSet) + { + var tcs = nextProcess; + nextProcess = new TaskCompletionSource(); + ProcessOnce(); + tcs.SetResult(); + } + + Dispose(); + }); + } +} + +public class TestDprFinder : DprFinderBase +{ + private SimulatedDprFinderService backend; + + public TestDprFinder(SimulatedDprFinderService backend) + { + this.backend = backend; + } + + public override void ReportNewPersistentVersion(long worldLine, WorkerVersion persisted, IEnumerable deps) + { + backend.GetDprFinderBackend().NewCheckpoint(worldLine, persisted, deps); + } + + protected override bool Sync(ClusterState stateToUpdate, Dictionary cutToUpdate) + { + var response = backend.GetResponseObject(); + try + { + response.rwLatch.EnterReadLock(); + if (response.currentCut.Count == 0) return false; + stateToUpdate.currentWorldLine = response.clusterState.currentWorldLine; + foreach (var entry in response.clusterState.worldLinePrefix) + stateToUpdate.worldLinePrefix.Add(entry.Key, entry.Value); + foreach (var entry in response.currentCut) + cutToUpdate.Add(entry.Key, entry.Value); + return true; + } + finally + { + response.rwLatch.ExitReadLock(); + } + } + + protected override void SendGraphReconstruction(WorkerId id, IStateObject stateObject) + { + var checkpoints = stateObject.GetUnprunedVersions(); + var service = backend.GetDprFinderBackend(); + foreach (var (bytes, offset) in 
checkpoints) + { + SerializationUtil.DeserializeCheckpointMetadata(new Span(bytes, offset, bytes.Length - offset), + out var worldLine, out var wv, out var deps); + service.NewCheckpoint(worldLine, wv, deps); + } + service.MarkWorkerAccountedFor(id); + } + + protected override void AddWorkerInternal(WorkerId id) + { + var manualResetEvent = new ManualResetEventSlim(); + backend.GetDprFinderBackend().AddWorker(id, _ => manualResetEvent.Set()); + manualResetEvent.Wait(); + } + + public override void RemoveWorker(WorkerId id) + { + var manualResetEvent = new ManualResetEventSlim(); + backend.GetDprFinderBackend().DeleteWorker(id, () => manualResetEvent.Set()); + manualResetEvent.Wait(); + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/TestPrecomputedResponse.cs b/cs/research/libdpr/test/FASTER.libdpr.test/TestPrecomputedResponse.cs new file mode 100644 index 000000000..81c387671 --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/TestPrecomputedResponse.cs @@ -0,0 +1,32 @@ +using System.Collections.Generic; + +namespace FASTER.libdpr +{ + + public class TestPrecomputedResponse : PrecomputedSyncResponseBase + { + public ClusterState clusterState; + public Dictionary currentCut = null; + + public override void ResetClusterState(ClusterState clusterState) + { + rwLatch.EnterWriteLock(); + this.clusterState = new ClusterState(); + this.clusterState.currentWorldLine = clusterState.currentWorldLine; + foreach (var entry in clusterState.worldLinePrefix) + this.clusterState.worldLinePrefix.Add(entry.Key, entry.Value); + currentCut = null; + rwLatch.ExitWriteLock(); + } + + public override void UpdateCut(Dictionary newCut) + { + rwLatch.EnterWriteLock(); + currentCut ??= new Dictionary(); + currentCut.Clear(); + foreach (var entry in newCut) + currentCut.Add(entry.Key, entry.Value); + rwLatch.ExitWriteLock(); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/TestStateObject.cs 
b/cs/research/libdpr/test/FASTER.libdpr.test/TestStateObject.cs new file mode 100644 index 000000000..f4f92e115 --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/TestStateObject.cs @@ -0,0 +1,107 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Threading; +namespace FASTER.libdpr; + +public class TestMessage +{ + internal byte[] dprHeader = new byte[1 << 15]; + internal WorkerId originator; + internal long originatorStateSerialNum; +} + +public class TestStateObject : IStateObject +{ + private struct TestCheckpointInfo + { + internal long stateSerialNum; + internal int numReceivedMessages; + internal byte[] dprMetadata; + } + + private WorkerId me; + private List receivedMessages = new(); + public long stateSerialNum; + private long persistedSerialNum; + private Dictionary checkpoints = new(); + private Dictionary pendingPersists; + + public TestStateObject(WorkerId me, bool autoCompleteCheckpoints = true) + { + this.me = me; + if (!autoCompleteCheckpoints) pendingPersists = new Dictionary(); + } + + public void Receive(TestMessage m) + { + Interlocked.Increment(ref stateSerialNum); + } + + public void DoLocalWork() + { + Interlocked.Increment(ref stateSerialNum); + } + + public TestMessage GenerateMessageToSend() + { + var incremented = Interlocked.Increment(ref stateSerialNum); + return new TestMessage + { + originator = me, + originatorStateSerialNum = incremented + }; + } + + public void PerformCheckpoint(long version, ReadOnlySpan metadata, Action onPersist) + { + // TODO(Tianyu): This might not be thread-safe in the new version of DPR that does not extend epoch protection + // into client code + var capturedState = stateSerialNum; + var capturedMessageCount = receivedMessages.Count; + var metadataArray = metadata.ToArray(); + var callback = () => + { + persistedSerialNum = capturedState; + checkpoints[version] = new TestCheckpointInfo + { + stateSerialNum = capturedState, + 
numReceivedMessages = capturedMessageCount, + dprMetadata = metadataArray + }; + onPersist(); + }; + + // Immediately complete checkpoint if auto completion is on + if (pendingPersists == null) + callback(); + else + pendingPersists[version] = callback; + } + + public void CompleteCheckpoint(long version) + { + Debug.Assert(pendingPersists != null); + pendingPersists[version](); + } + + public void RestoreCheckpoint(long version, out ReadOnlySpan metadata) + { + var restored = checkpoints[version]; + persistedSerialNum = stateSerialNum = restored.stateSerialNum; + receivedMessages.RemoveRange(restored.numReceivedMessages, + receivedMessages.Count - restored.numReceivedMessages); + metadata = restored.dprMetadata; + } + + public void PruneVersion(long version) + { + checkpoints.Remove(version); + } + + public IEnumerable<(byte[], int)> GetUnprunedVersions() + { + return checkpoints.Values.Select(info => ValueTuple.Create(info.dprMetadata, 0)); + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendConcurrentTest.cs b/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendConcurrentTest.cs new file mode 100644 index 000000000..13de5d356 --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendConcurrentTest.cs @@ -0,0 +1,221 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using FASTER.core; +using NUnit.Framework; + +namespace FASTER.libdpr +{ + internal class SimulatedWorker + { + private WorkerId me; + private List cluster; + private long worldLine, version, lastChecked; + private Dictionary> versions; + private Func backend; + private TestPrecomputedResponse response; + private double depProb; + private Random random; + private bool finished = false; + + public SimulatedWorker(WorkerId me, List cluster, Func backend, double depProb) + { + this.me = me; + worldLine = 1; + version = 1; + lastChecked = 0; + versions = 
new Dictionary>(); + this.backend = backend; + this.depProb = depProb; + this.cluster = cluster; + random = new Random(); + response = new TestPrecomputedResponse(); + backend().AddWorker(me); + backend().AddResponseObjectToPrecompute(response); + } + + private void SimulateOneVersion(bool generateDeps = true) + { + Thread.Sleep(random.Next(10, 20)); + var deps = new List(); + var wv = new WorkerVersion(me, version); + versions[wv] = deps; + if (generateDeps) + { + for (var i = 0; i < cluster.Count; i++) + { + var worker = cluster[random.Next(cluster.Count)]; + var depVersion = Interlocked.Read(ref worker.version); + if (me.Equals(worker.me)) continue; + if (random.NextDouble() >= depProb || worker.finished) + continue; + + if (depVersion > version) + { + // end version now + backend().NewCheckpoint(worldLine, wv, deps); + version = worker.version; + return; + } + deps.Add(new WorkerVersion(worker.me, depVersion)); + } + } + backend().NewCheckpoint(worldLine, wv, deps); + Interlocked.Increment(ref version); + } + + private void CheckInvariants() + { + response.rwLatch.EnterReadLock(); + var deserializedState = response.clusterState; + if (response.currentCut.Count == 0) + { + response.rwLatch.ExitReadLock(); + worldLine = deserializedState.currentWorldLine; + version = deserializedState.worldLinePrefix[me]; + versions = versions.Where(kv => kv.Key.Version <= version && kv.Key.Version > lastChecked) + .ToDictionary(kv => kv.Key, kv => kv.Value); + + foreach (var (wv, deps) in versions) + { + backend().NewCheckpoint(worldLine, wv, deps); + } + backend().MarkWorkerAccountedFor(me); + return; + } + + var deserializedCut = new Dictionary(response.currentCut); + response.rwLatch.ExitReadLock(); + + var persistedUntil = deserializedCut[me]; + // Guarantees should never regress, even if backend failed + Assert.GreaterOrEqual(persistedUntil, lastChecked); + // Check that all committed versions have persistent dependencies + for (var v = lastChecked + 1; v <= persistedUntil; 
v++) + { + if (!versions.TryGetValue(new WorkerVersion(me, v), out var deps)) continue; + foreach (var dep in deps) + { + Assert.LessOrEqual(dep.Version, cluster[(int) dep.WorkerId.guid].version); + } + } + lastChecked = persistedUntil; + } + + + public void Simulate(ManualResetEventSlim termination) + { + while (!termination.IsSet) + { + SimulateOneVersion(); + CheckInvariants(); + } + finished = true; + var lastVersion = version; + SimulateOneVersion(false); + while (lastChecked < lastVersion) + CheckInvariants(); + } + } + + internal class SimulatedCluster + { + // Randomly reset to simulate DprFinder failure + private SimulatedDprFinderService backend; + private Thread failOver, compute; + private double failureProb; + private List cluster; + + public SimulatedCluster(SimulatedDprFinderService backend, double failureProb, IEnumerable cluster) + { + this.backend = backend; + this.failureProb = failureProb; + this.cluster = cluster.ToList(); + } + + public GraphDprFinderBackend GetDprFinder() => backend.GetDprFinderBackend(); + + public void Simulate(int simulationTimeMilli) + { + var failOverTermination = new ManualResetEventSlim(); + var workerTermination = new ManualResetEventSlim(); + var backendTermination = new ManualResetEventSlim(); + failOver = new Thread(() => + { + var rand = new Random(); + // failure simulator terminate before worker threads are joined so they can at least have one failure-free + // version to ensure we make progress + while (!failOverTermination.IsSet) + { + Thread.Sleep(10); + if (rand.NextDouble() < failureProb) + backend.FailOver(5); + } + }); + compute = new Thread(() => + { + while (!backendTermination.IsSet) + backend.ProcessOnce(); + }); + compute.Start(); + failOver.Start(); + + var threads = new List(); + foreach (var worker in cluster) + { + var t = new Thread(() => worker.Simulate(workerTermination)); + threads.Add(t); + t.Start(); + } + + Thread.Sleep(simulationTimeMilli); + failOverTermination.Set(); + failOver.Join(); 
+ + workerTermination.Set(); + foreach (var t in threads) + t.Join(); + + backendTermination.Set(); + compute.Join(); + } + } + + [TestFixture] + public class GraphDprFinderConcurrentTest + { + [Test] + public void ConcurrentTestDprFinderSmallNoFailure() + { + var testedBackend = new SimulatedDprFinderService(); + var clusterInfo = new List(); + for (var i = 0; i < 3; i++) + clusterInfo.Add(new SimulatedWorker(new WorkerId(i), clusterInfo, testedBackend.GetDprFinderBackend, 0.5)); + var testCluster = new SimulatedCluster(testedBackend, 0.0, clusterInfo); + testCluster.Simulate(1000); + } + + [Test] + public void ConcurrentTestDprFinderLargeNoFailure() + { + var testedBackend = new SimulatedDprFinderService(); + var clusterInfo = new List(); + for (var i = 0; i < 30; i++) + clusterInfo.Add(new SimulatedWorker(new WorkerId(i), clusterInfo, testedBackend.GetDprFinderBackend, 0.75)); + var testedCluster = new SimulatedCluster(testedBackend, 0.0, clusterInfo); + testedCluster.Simulate(30000); + } + + [Test] + public void ConcurrentTestDprFinderFailure() + { + var testedBackend = new SimulatedDprFinderService(); + var clusterInfo = new List(); + for (var i = 0; i < 10; i++) + clusterInfo.Add(new SimulatedWorker(new WorkerId(i), clusterInfo, testedBackend.GetDprFinderBackend, 0.75)); + var testedCluster = new SimulatedCluster(testedBackend, 0.05, clusterInfo); + testedCluster.Simulate(1000); + } + } +} \ No newline at end of file diff --git a/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendTest.cs b/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendTest.cs new file mode 100644 index 000000000..54ac5ad3b --- /dev/null +++ b/cs/research/libdpr/test/FASTER.libdpr.test/dprfinder/FinderBackendTest.cs @@ -0,0 +1,337 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using FASTER.core; +using NUnit.Framework; + +namespace FASTER.libdpr +{ + [TestFixture] + public class FinderBackendTest + { + 
private static void CheckClusterState(GraphDprFinderBackend backend, TestPrecomputedResponse response, long expectedWorldLine, + Dictionary expectedPrefix) + { + try + { + response.rwLatch.EnterReadLock(); + Assert.AreEqual(expectedWorldLine, response.clusterState.currentWorldLine); + Assert.AreEqual(expectedPrefix, response.clusterState.worldLinePrefix); + } + finally + { + response.rwLatch.ExitReadLock(); + } + + } + private static void CheckDprCut(GraphDprFinderBackend backend, TestPrecomputedResponse response, Dictionary expectedCut) + { + try + { + response.rwLatch.EnterReadLock(); + Assert.AreEqual(expectedCut, response.currentCut); + } + finally + { + response.rwLatch.ExitReadLock(); + } + } + + [Test] + public void TestDprFinderBackendSequential() + { + var localDevice1 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var localDevice2 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var testResponse = new TestPrecomputedResponse(); + var testedBackend = new GraphDprFinderBackend(new PingPongDevice(localDevice1, localDevice2)); + testedBackend.AddResponseObjectToPrecompute(testResponse); + + var A = new WorkerId(0); + var B = new WorkerId(1); + var C = new WorkerId(2); + + var addComplete = new CountdownEvent(3); + testedBackend.AddWorker(A, _ => addComplete.Signal()); + testedBackend.AddWorker(B, _ => addComplete.Signal()); + testedBackend.AddWorker(C, _ => addComplete.Signal()); + testedBackend.Process(); + addComplete.Wait(); + CheckClusterState(testedBackend, testResponse, 1, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + + var A1 = new WorkerVersion(A, 1); + var B1 = new WorkerVersion(B, 1); + var A2 = new WorkerVersion(A, 2); + var B2 = new WorkerVersion(B, 2); + var C2 = new WorkerVersion(C, 2); + + testedBackend.NewCheckpoint(1, A1, Enumerable.Empty()); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 
1}, + {B, 0}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, B1, new[] {A1}); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, A2, new []{ A1, B2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, B2, new []{ B1, C2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, C2, new []{ A2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 2}, + {B, 2}, + {C, 2} + }); + + localDevice1.Dispose(); + localDevice2.Dispose(); + } + + [Test] + public void TestDprFinderBackendWithWorkerFailure() + { + var localDevice1 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var localDevice2 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var testResponse = new TestPrecomputedResponse(); + var testedBackend = new GraphDprFinderBackend(new PingPongDevice(localDevice1, localDevice2)); + testedBackend.AddResponseObjectToPrecompute(testResponse); + + var A = new WorkerId(0); + var B = new WorkerId(1); + var C = new WorkerId(2); + + var addComplete = new CountdownEvent(3); + testedBackend.AddWorker(A, _ => addComplete.Signal()); + testedBackend.AddWorker(B, _ => addComplete.Signal()); + testedBackend.AddWorker(C, _ => addComplete.Signal()); + testedBackend.Process(); + addComplete.Wait(); + CheckClusterState(testedBackend, testResponse, 1, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + + var A1 = new WorkerVersion(A, 1); + var B1 = new WorkerVersion(B, 1); + var A2 = new WorkerVersion(A, 2); + var B2 = new WorkerVersion(B, 2); + var C2 = new WorkerVersion(C, 2); + var A3 = new WorkerVersion(A, 3); + + + 
testedBackend.NewCheckpoint(1, A1, Enumerable.Empty()); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 0}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, B1, new[] {A1}); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, A2, new []{ A1, B2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, B2, new []{ B1, C2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + var restartDone = new ManualResetEventSlim(); + testedBackend.AddWorker(A, _ => restartDone.Set()); + testedBackend.Process(); + restartDone.Wait(); + CheckClusterState(testedBackend, testResponse, 2, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + testedBackend.NewCheckpoint(1, C2, new []{ A2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + // C2 Should have be rejected and never commit + testedBackend.NewCheckpoint(2, A3, Enumerable.Empty()); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 3}, + {B, 1}, + {C, 0} + }); + + localDevice1.Dispose(); + localDevice2.Dispose(); + } + + [Test] + public void TestDprFinderBackendRestart() + { + var localDevice1 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var localDevice2 = new LocalMemoryDevice(1 << 20, 1 << 20, 1); + var testResponse = new TestPrecomputedResponse(); + var testedBackend = new GraphDprFinderBackend(new PingPongDevice(localDevice1, localDevice2)); + testedBackend.AddResponseObjectToPrecompute(testResponse); + + var A = new WorkerId(0); + var B = new WorkerId(1); 
+ var C = new WorkerId(2); + + var addComplete = new CountdownEvent(3); + testedBackend.AddWorker(A, _ => addComplete.Signal()); + testedBackend.AddWorker(B, _ => addComplete.Signal()); + testedBackend.AddWorker(C, _ => addComplete.Signal()); + testedBackend.Process(); + addComplete.Wait(); + CheckClusterState(testedBackend, testResponse, 1, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + + + var A1 = new WorkerVersion(A, 1); + var B1 = new WorkerVersion(B, 1); + var A2 = new WorkerVersion(A, 2); + var B2 = new WorkerVersion(B, 2); + var C2 = new WorkerVersion(C, 2); + + testedBackend.NewCheckpoint(1, A1, Enumerable.Empty()); + testedBackend.NewCheckpoint(1, B1, new[] {A1}); + testedBackend.NewCheckpoint(1, A2, new []{ A1, B2 }); + testedBackend.NewCheckpoint(1, B2, new []{ B1, C2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + // Get a new test backend to simulate restart from disk + testedBackend = new GraphDprFinderBackend(new PingPongDevice(localDevice1, localDevice2)); + testResponse = new TestPrecomputedResponse(); + testedBackend.AddResponseObjectToPrecompute(testResponse); + CheckClusterState(testedBackend, testResponse, 1, new Dictionary + { + {A, 0}, + {B, 0}, + {C, 0} + }); + // Cut should be temporarily unavailable during recovery + CheckDprCut(testedBackend, testResponse, null); + + // Simulate resending of graph + testedBackend.NewCheckpoint(1, A2, new []{ B2 }); + testedBackend.MarkWorkerAccountedFor(A); + testedBackend.NewCheckpoint(1, B2, new []{ C2 }); + testedBackend.MarkWorkerAccountedFor(B); + testedBackend.MarkWorkerAccountedFor(C); + testedBackend.Process(); + + // We should reach the same cut when dpr finder recovery is complete + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 1}, + {B, 1}, + {C, 0} + }); + + 
testedBackend.NewCheckpoint(1, C2, new []{ A2 }); + testedBackend.Process(); + CheckDprCut(testedBackend, testResponse, new Dictionary + { + {A, 2}, + {B, 2}, + {C, 2} + }); + + localDevice1.Dispose(); + localDevice2.Dispose(); + } + } +} \ No newline at end of file diff --git a/cs/src/core/Allocator/AllocatorBase.cs b/cs/src/core/Allocator/AllocatorBase.cs index ed910d250..384625418 100644 --- a/cs/src/core/Allocator/AllocatorBase.cs +++ b/cs/src/core/Allocator/AllocatorBase.cs @@ -2100,6 +2100,7 @@ public void AsyncFlushPages(long fromAddress, long untilAddress, bool noFlush = ShiftFlushedUntilAddress(); continue; } + // Partial page starting point, need to wait until the // ongoing adjacent flush is completed to ensure correctness @@ -2123,7 +2124,9 @@ public void AsyncFlushPages(long fromAddress, long untilAddress, bool noFlush = } } else + { WriteAsync(flushPage, AsyncFlushPageCallback, asyncResult); + } } } diff --git a/cs/src/core/Allocator/WorkQueueLIFO.cs b/cs/src/core/Allocator/WorkQueueLIFO.cs index 5c7ee8355..261706c53 100644 --- a/cs/src/core/Allocator/WorkQueueLIFO.cs +++ b/cs/src/core/Allocator/WorkQueueLIFO.cs @@ -14,7 +14,7 @@ namespace FASTER.core /// Shared work queue that ensures one worker at any given time. Uses LIFO ordering of work. 
/// /// - class WorkQueueLIFO : IDisposable + public class WorkQueueLIFO : IDisposable { const int kMaxQueueSize = 1 << 30; readonly ConcurrentStack _queue; @@ -30,6 +30,9 @@ public WorkQueueLIFO(Action work) _disposed = false; } + /// + /// Dispose + /// public void Dispose() { _disposed = true; diff --git a/cs/src/core/Device/ManagedLocalStorageDevice.cs b/cs/src/core/Device/ManagedLocalStorageDevice.cs index 1755e2573..ba10f9f1c 100644 --- a/cs/src/core/Device/ManagedLocalStorageDevice.cs +++ b/cs/src/core/Device/ManagedLocalStorageDevice.cs @@ -23,6 +23,7 @@ public sealed class ManagedLocalStorageDevice : StorageDeviceBase private readonly SafeConcurrentDictionary, AsyncPool)> logHandles; private readonly SectorAlignedBufferPool pool; + /// /// Number of pending reads on device /// @@ -77,11 +78,25 @@ public override void Reset() } } } + /// // We do not throttle ManagedLocalStorageDevice because our AsyncPool of handles takes care of this public override bool Throttle() => false; + public static void RemoveIfPresent(string filename) + { + FileInfo fi = new(filename); // may not exist + DirectoryInfo di = fi.Directory; + if (!di.Exists) return; + + string bareName = fi.Name; + + foreach (FileInfo item in di.GetFiles(bareName + "*")) + File.Delete(item.FullName); + } + + private void RecoverFiles() { FileInfo fi = new(FileName); // may not exist @@ -189,7 +204,7 @@ public override void ReadAsync(int segmentId, ulong sourceAddress, return; } - _ = Task.Run(async () => + Task.Factory.StartNew(async () => { if (!gotHandle) { @@ -267,7 +282,7 @@ public override void ReadAsync(int segmentId, ulong sourceAddress, // Issue user callback callback(errorCode, (uint)numBytes, context); } - }); + }, CancellationToken.None, TaskCreationOptions.None, TaskScheduler.Default); } /// @@ -351,7 +366,7 @@ public override void WriteAsync(IntPtr sourceAddress, return; } - _ = Task.Run(async () => + Task.Factory.StartNew(async () => { if (!gotHandle) { @@ -429,7 +444,7 @@ public 
override void WriteAsync(IntPtr sourceAddress, // Issue user callback callback(errorCode, numBytesToWrite, context); } - }); + }, CancellationToken.None, TaskCreationOptions.None, TaskScheduler.Default); } /// @@ -481,6 +496,10 @@ public override long GetFileSize(int segment) public override void Dispose() { _disposed = true; + // reusableSchedulers.Return(scheduler); + // if (Interlocked.Decrement(ref instanceCount) == 0) + // reusableSchedulers.DisposeAllResources(); + foreach (var entry in logHandles) { entry.Value.Item1.Dispose(); diff --git a/cs/src/core/Epochs/EpochProtectedVersionScheme.cs b/cs/src/core/Epochs/EpochProtectedVersionScheme.cs index 43c8d8a64..014a02e1c 100644 --- a/cs/src/core/Epochs/EpochProtectedVersionScheme.cs +++ b/cs/src/core/Epochs/EpochProtectedVersionScheme.cs @@ -4,255 +4,18 @@ using System; using System.Diagnostics; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Threading; +using Microsoft.Extensions.Logging.Abstractions; namespace FASTER.core { - /// - /// The current state of a state-machine operation such as a checkpoint. 
- /// - [StructLayout(LayoutKind.Explicit, Size = 8)] - public struct VersionSchemeState - { - /// - /// Special value denoting that the version state machine is at rest in stable state - /// - public const byte REST = 0; - const int kTotalSizeInBytes = 8; - const int kTotalBits = kTotalSizeInBytes * 8; - - // Phase - const int kPhaseBits = 8; - const int kPhaseShiftInWord = kTotalBits - kPhaseBits; - const long kPhaseMaskInWord = ((1L << kPhaseBits) - 1) << kPhaseShiftInWord; - const long kPhaseMaskInInteger = (1L << kPhaseBits) - 1; - - // Version - const int kVersionBits = kPhaseShiftInWord; - const long kVersionMaskInWord = (1L << kVersionBits) - 1; - - /// Internal intermediate state of state machine - private const byte kIntermediateMask = 128; - - [FieldOffset(0)] internal long Word; - - /// - /// Custom phase marker denoting where in a state machine EPVS is in right now - /// - public byte Phase - { - get { return (byte)((Word >> kPhaseShiftInWord) & kPhaseMaskInInteger); } - set - { - Word &= ~kPhaseMaskInWord; - Word |= (((long)value) & kPhaseMaskInInteger) << kPhaseShiftInWord; - } - } - - /// - /// whether EPVS is in intermediate state now (transitioning between two states) - public bool IsIntermediate() => (Phase & kIntermediateMask) != 0; - - /// - /// Version number of the current state - /// - public long Version - { - get { return Word & kVersionMaskInWord; } - set - { - Word &= ~kVersionMaskInWord; - Word |= value & kVersionMaskInWord; - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static VersionSchemeState Copy(ref VersionSchemeState other) - { - var info = default(VersionSchemeState); - info.Word = other.Word; - return info; - } - - /// - /// Make a state with the given phase and version - /// - /// - /// - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static VersionSchemeState Make(byte phase, long version) - { - var info = default(VersionSchemeState); - info.Phase = phase; - info.Version = 
version; - return info; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static VersionSchemeState MakeIntermediate(VersionSchemeState state) - => Make((byte)(state.Phase | kIntermediateMask), state.Version); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void RemoveIntermediate(ref VersionSchemeState state) - { - state.Phase = (byte)(state.Phase & ~kIntermediateMask); - } - - internal static bool Equal(VersionSchemeState s1, VersionSchemeState s2) - { - return s1.Word == s2.Word; - } - - /// - public override string ToString() - { - return $"[{Phase},{Version}]"; - } - - /// - /// Compare the current to for equality if obj is also a - /// - public override bool Equals(object obj) - { - return obj is VersionSchemeState other && Equals(other); - } - - /// - public override int GetHashCode() - { - return Word.GetHashCode(); - } - - /// - /// Compare the current to for equality - /// - private bool Equals(VersionSchemeState other) - { - return Word == other.Word; - } - - /// - /// Equals - /// - public static bool operator ==(VersionSchemeState left, VersionSchemeState right) - { - return left.Equals(right); - } - - /// - /// Not Equals - /// - public static bool operator !=(VersionSchemeState left, VersionSchemeState right) - { - return !(left == right); - } - } - - /// - /// A version state machine specifies a sequence of transitions to a new version - /// - public abstract class VersionSchemeStateMachine - { - private long toVersion; - /// - /// The actual version this state machine is advancing to, or -1 if not yet determined - /// - protected internal long actualToVersion = -1; - - /// - /// Constructs a new version state machine for transition to the given version - /// - /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version - protected VersionSchemeStateMachine(long toVersion = -1) - { - this.toVersion = toVersion; - actualToVersion = toVersion; - } - - /// - /// 
version to transition to, or -1 if unconditionally transitioning to an unspecified next version - public long ToVersion() => toVersion; - - /// - /// Given the current state, compute the next state the version scheme should enter, if any. - /// - /// the current state - /// the next state, if any - /// whether a state transition is possible at this moment - public abstract bool GetNextStep(VersionSchemeState currentState, out VersionSchemeState nextState); - - /// - /// Code block to execute before entering a state. Guaranteed to execute in a critical section with mutual - /// exclusion with other transitions or EPVS-protected code regions - /// - /// the current state - /// the state transitioning to - public abstract void OnEnteringState(VersionSchemeState fromState, VersionSchemeState toState); - - /// - /// Code block to execute after entering the state. Execution here may interleave with other EPVS-protected - /// code regions. This can be used to collaborative perform heavyweight transition work without blocking - /// progress of other threads. - /// - /// the current state - public abstract void AfterEnteringState(VersionSchemeState state); - } - - internal class SimpleVersionSchemeStateMachine : VersionSchemeStateMachine - { - private Action criticalSection; - - public SimpleVersionSchemeStateMachine(Action criticalSection, long toVersion = -1) : base(toVersion) - { - this.criticalSection = criticalSection; - } - - public override bool GetNextStep(VersionSchemeState currentState, out VersionSchemeState nextState) - { - Debug.Assert(currentState.Phase == VersionSchemeState.REST); - nextState = VersionSchemeState.Make(VersionSchemeState.REST, ToVersion() == -1 ? 
currentState.Version + 1 : ToVersion()); - return true; - } - - public override void OnEnteringState(VersionSchemeState fromState, VersionSchemeState toState) - { - Debug.Assert(fromState.Phase == VersionSchemeState.REST && toState.Phase == VersionSchemeState.REST); - criticalSection(fromState.Version, toState.Version); - } - - public override void AfterEnteringState(VersionSchemeState state) { } - } - - /// - /// Status for state machine execution - /// - public enum StateMachineExecutionStatus - { - /// - /// execution successful - /// - OK, - /// - /// execution unsuccessful but may be retried - /// - RETRY, - /// - /// execution failed and should not be retried - /// - FAIL - } - /// /// Epoch Protected Version Scheme /// - public class EpochProtectedVersionScheme + public class EpochProtectedVersionScheme : VersionSchemeBase { private LightEpoch epoch; - private VersionSchemeState state; - private VersionSchemeStateMachine currentMachine; - + /// /// Construct a new EPVS backed by the given epoch framework. Multiple EPVS instances can share an underlying /// epoch framework (WARNING: re-entrance is not yet supported, so nested protection of these shared instances @@ -262,42 +25,36 @@ public class EpochProtectedVersionScheme public EpochProtectedVersionScheme(LightEpoch epoch) { this.epoch = epoch; - state = VersionSchemeState.Make(VersionSchemeState.REST, 1); - currentMachine = null; } - - /// - /// the current state - public VersionSchemeState CurrentState() => state; - - // Atomic transition from expectedState -> nextState - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private bool MakeTransition(VersionSchemeState expectedState, VersionSchemeState nextState) + + /// Get the underlying epoch framework. 
For advanced use cases only + /// the underlying epoch framework + public LightEpoch GetUnderlyingEpoch() => epoch; + + public override void SignalStepAvailable(LightEpoch.EpochContext context = null) { - if (Interlocked.CompareExchange(ref state.Word, nextState.Word, expectedState.Word) != expectedState.Word) - return false; - Debug.WriteLine("Moved to {0}, {1}", nextState.Phase, nextState.Version); - return true; + TryStepStateMachine(null, context); } + /// /// Enter protection on the current thread. During protection, no version transition is possible. For the system /// to make progress, protection must be later relinquished on the same thread using Leave() or Refresh() /// /// the state of the EPVS as of protection, which extends until the end of protection - public VersionSchemeState Enter() + public override VersionSchemeState Enter(LightEpoch.EpochContext context = null) { - epoch.Resume(); - TryStepStateMachine(); + epoch.Resume(context); + TryStepStateMachine(null, context); VersionSchemeState result; while (true) { result = state; if (!result.IsIntermediate()) break; - epoch.Suspend(); + epoch.Suspend(context); Thread.Yield(); - epoch.Resume(); + epoch.Resume(context); } return result; @@ -307,19 +64,19 @@ public VersionSchemeState Enter() /// Refreshes protection --- equivalent to dropping and immediately reacquiring protection, but more performant. 
/// /// the state of the EPVS as of protection, which extends until the end of protection - public VersionSchemeState Refresh() + public override VersionSchemeState Refresh(LightEpoch.EpochContext context = null) { - epoch.ProtectAndDrain(); + epoch.ProtectAndDrain(context); VersionSchemeState result = default; - TryStepStateMachine(); + TryStepStateMachine(null, context); while (true) { result = state; if (!result.IsIntermediate()) break; - epoch.Suspend(); + epoch.Suspend(context); Thread.Yield(); - epoch.Resume(); + epoch.Resume(context); } return result; } @@ -327,12 +84,22 @@ public VersionSchemeState Refresh() /// /// Drop protection of the current thread /// - public void Leave() + public override void Leave(LightEpoch.EpochContext context = null) + { + epoch.Suspend(context); + } + + // Atomic transition from expectedState -> nextState + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool MakeTransition(VersionSchemeState expectedState, VersionSchemeState nextState) { - epoch.Suspend(); + if (Interlocked.CompareExchange(ref state.Word, nextState.Word, expectedState.Word) != expectedState.Word) + return false; + Debug.WriteLine("Moved to {0}, {1}", nextState.Phase, nextState.Version); + return true; } - internal void TryStepStateMachine(VersionSchemeStateMachine expectedMachine = null) + protected override void TryStepStateMachine(VersionSchemeStateMachine expectedMachine, LightEpoch.EpochContext context) { var machineLocal = currentMachine; var oldState = state; @@ -360,17 +127,17 @@ internal void TryStepStateMachine(VersionSchemeStateMachine expectedMachine = nu if (!MakeTransition(oldState, intermediate)) return; // Avoid upfront memory allocation by going to a function - StepMachineHeavy(machineLocal, oldState, nextState); + StepMachineHeavy(machineLocal, oldState, nextState, context); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void StepMachineHeavy(VersionSchemeStateMachine machineLocal, VersionSchemeState old, 
VersionSchemeState next) + private void StepMachineHeavy(VersionSchemeStateMachine machineLocal, VersionSchemeState old, VersionSchemeState next, LightEpoch.EpochContext context) { // // Resume epoch to ensure that state machine is able to make progress // if this thread is the only active thread. Also, StepMachineHeavy calls BumpCurrentEpoch, which requires a protected thread. - bool isProtected = epoch.ThisInstanceProtected(); + bool isProtected = epoch.ThisInstanceProtected(context); if (!isProtected) - epoch.Resume(); + epoch.Resume(context); try { epoch.BumpCurrentEpoch(() => @@ -379,114 +146,14 @@ private void StepMachineHeavy(VersionSchemeStateMachine machineLocal, VersionSch var success = MakeTransition(VersionSchemeState.MakeIntermediate(old), next); machineLocal.AfterEnteringState(next); Debug.Assert(success); - TryStepStateMachine(machineLocal); - }); + TryStepStateMachine(machineLocal, context); + }, context); } finally { if (!isProtected) - epoch.Suspend(); - } - } - - /// - /// Signals to EPVS that a new step is available in the state machine. This is useful when the state machine - /// delays a step (e.g., while waiting on IO to complete) and invoked after the step is available, so the - /// state machine can make progress even without active threads entering and leaving the system. There is no - /// need to invoke this method if steps are always available. - /// - public void SignalStepAvailable() - { - TryStepStateMachine(); - } - - /// - /// Attempt to start executing the given state machine. 
- /// - /// state machine to execute - /// - /// whether the state machine is successfully started (OK), - /// cannot be started due to an active state machine (RETRY), - /// or cannot be started because the version has advanced past the target version specified (FAIL) - /// - public StateMachineExecutionStatus TryExecuteStateMachine(VersionSchemeStateMachine stateMachine) - { - if (stateMachine.ToVersion() != -1 && stateMachine.ToVersion() <= state.Version) return StateMachineExecutionStatus.FAIL; - var actualStateMachine = Interlocked.CompareExchange(ref currentMachine, stateMachine, null); - if (actualStateMachine == null) - { - // Compute the actual ToVersion of state machine - stateMachine.actualToVersion = - stateMachine.ToVersion() == -1 ? state.Version + 1 : stateMachine.ToVersion(); - // Trigger one initial step to begin the process - TryStepStateMachine(stateMachine); - return StateMachineExecutionStatus.OK; - } - - // Otherwise, need to check that we are not a duplicate attempt to increment version - if (stateMachine.ToVersion() != -1 && actualStateMachine.actualToVersion >= stateMachine.ToVersion()) - return StateMachineExecutionStatus.FAIL; - - return StateMachineExecutionStatus.RETRY; - } - - - /// - /// Start executing the given state machine - /// - /// state machine to start - /// whether to spin wait until version transition is complete - /// whether the state machine can be executed. 
If false, EPVS has advanced version past the target version specified - public bool ExecuteStateMachine(VersionSchemeStateMachine stateMachine, bool spin = false) - { - if (epoch.ThisInstanceProtected()) - throw new InvalidOperationException("unsafe to execute a state machine blockingly when under protection"); - StateMachineExecutionStatus status; - do - { - status = TryExecuteStateMachine(stateMachine); - } while (status == StateMachineExecutionStatus.RETRY); - - if (status != StateMachineExecutionStatus.OK) return false; - - if (spin) - { - while (state.Version != stateMachine.actualToVersion || state.Phase != VersionSchemeState.REST) - { - TryStepStateMachine(); - Thread.Yield(); - } + epoch.Suspend(context); } - - return true; } - - /// - /// Advance the version with a single critical section to the requested version. - /// - /// critical section to execute, with old version and new (target) version as arguments - /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version - /// - /// whether the state machine is successfully started (OK), - /// cannot be started due to an active state machine (RETRY), - /// or cannot be started because the version has advanced past the target version specified (FAIL) - /// - public StateMachineExecutionStatus TryAdvanceVersionWithCriticalSection(Action criticalSection, long targetVersion = -1) - { - return TryExecuteStateMachine(new SimpleVersionSchemeStateMachine(criticalSection, targetVersion)); - } - - /// - /// Advance the version with a single critical section to the requested version. - /// - /// critical section to execute, with old version and new (target) version as arguments - /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version - /// whether to spin wait until version transition is complete - /// whether the state machine can be executed. 
If false, EPVS has advanced version past the target version specified - public bool AdvanceVersionWithCriticalSection(Action criticalSection, long targetVersion = -1, bool spin = false) - { - return ExecuteStateMachine(new SimpleVersionSchemeStateMachine(criticalSection, targetVersion), spin); - } - } } \ No newline at end of file diff --git a/cs/src/core/Epochs/IVersionScheme.cs b/cs/src/core/Epochs/IVersionScheme.cs new file mode 100644 index 000000000..83060ff46 --- /dev/null +++ b/cs/src/core/Epochs/IVersionScheme.cs @@ -0,0 +1,388 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core; + /// + /// The current state of a state-machine operation such as a checkpoint. + /// + [StructLayout(LayoutKind.Explicit, Size = 8)] + public struct VersionSchemeState + { + /// + /// Special value denoting that the version state machine is at rest in stable state + /// + public const byte REST = 0; + const int kTotalSizeInBytes = 8; + const int kTotalBits = kTotalSizeInBytes * 8; + + // Phase + const int kPhaseBits = 8; + const int kPhaseShiftInWord = kTotalBits - kPhaseBits; + const long kPhaseMaskInWord = ((1L << kPhaseBits) - 1) << kPhaseShiftInWord; + const long kPhaseMaskInInteger = (1L << kPhaseBits) - 1; + + // Version + const int kVersionBits = kPhaseShiftInWord; + const long kVersionMaskInWord = (1L << kVersionBits) - 1; + + /// Internal intermediate state of state machine + private const byte kIntermediateMask = 128; + + [FieldOffset(0)] internal long Word; + + /// + /// Custom phase marker denoting where in a state machine EPVS is in right now + /// + public byte Phase + { + get { return (byte)((Word >> kPhaseShiftInWord) & kPhaseMaskInInteger); } + set + { + Word &= ~kPhaseMaskInWord; + Word |= (((long)value) & kPhaseMaskInInteger) << kPhaseShiftInWord; + } + } + + /// + /// whether EPVS is in intermediate state now (transitioning between two 
states) + public bool IsIntermediate() => (Phase & kIntermediateMask) != 0; + + /// + /// Version number of the current state + /// + public long Version + { + get { return Word & kVersionMaskInWord; } + set + { + Word &= ~kVersionMaskInWord; + Word |= value & kVersionMaskInWord; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static VersionSchemeState Copy(ref VersionSchemeState other) + { + var info = default(VersionSchemeState); + info.Word = other.Word; + return info; + } + + /// + /// Make a state with the given phase and version + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static VersionSchemeState Make(byte phase, long version) + { + var info = default(VersionSchemeState); + info.Phase = phase; + info.Version = version; + return info; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static VersionSchemeState MakeIntermediate(VersionSchemeState state) + => Make((byte)(state.Phase | kIntermediateMask), state.Version); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void RemoveIntermediate(ref VersionSchemeState state) + { + state.Phase = (byte)(state.Phase & ~kIntermediateMask); + } + + internal static bool Equal(VersionSchemeState s1, VersionSchemeState s2) + { + return s1.Word == s2.Word; + } + + /// + public override string ToString() + { + return $"[{Phase},{Version}]"; + } + + /// + /// Compare the current to for equality if obj is also a + /// + public override bool Equals(object obj) + { + return obj is VersionSchemeState other && Equals(other); + } + + /// + public override int GetHashCode() + { + return Word.GetHashCode(); + } + + /// + /// Compare the current to for equality + /// + private bool Equals(VersionSchemeState other) + { + return Word == other.Word; + } + + /// + /// Equals + /// + public static bool operator ==(VersionSchemeState left, VersionSchemeState right) + { + return left.Equals(right); + } + + /// + /// Not Equals + /// 
+ public static bool operator !=(VersionSchemeState left, VersionSchemeState right) + { + return !(left == right); + } + } + + /// + /// A version state machine specifies a sequence of transitions to a new version + /// + public abstract class VersionSchemeStateMachine + { + private long toVersion; + /// + /// The actual version this state machine is advancing to, or -1 if not yet determined + /// + protected internal long actualToVersion = -1; + + /// + /// Constructs a new version state machine for transition to the given version + /// + /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version + protected VersionSchemeStateMachine(long toVersion) + { + this.toVersion = toVersion; + actualToVersion = toVersion; + } + + /// + /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version + public long ToVersion() => toVersion; + + /// + /// Given the current state, compute the next state the version scheme should enter, if any. + /// + /// the current state + /// the next state, if any + /// whether a state transition is possible at this moment + public abstract bool GetNextStep(VersionSchemeState currentState, out VersionSchemeState nextState); + + /// + /// Code block to execute before entering a state. Guaranteed to execute in a critical section with mutual + /// exclusion with other transitions or EPVS-protected code regions + /// + /// the current state + /// the state transitioning to + public abstract void OnEnteringState(VersionSchemeState fromState, VersionSchemeState toState); + + /// + /// Code block to execute after entering the state. Execution here may interleave with other EPVS-protected + /// code regions. This can be used to collaborative perform heavyweight transition work without blocking + /// progress of other threads. 
+ /// + /// the current state + public abstract void AfterEnteringState(VersionSchemeState state); + } + + internal class SimpleVersionSchemeStateMachine : VersionSchemeStateMachine + { + private Action criticalSection; + + public SimpleVersionSchemeStateMachine(Action criticalSection, long toVersion = -1) : base(toVersion) + { + this.criticalSection = criticalSection; + } + + public override bool GetNextStep(VersionSchemeState currentState, out VersionSchemeState nextState) + { + Debug.Assert(currentState.Phase == VersionSchemeState.REST); + nextState = VersionSchemeState.Make(VersionSchemeState.REST, ToVersion() == -1 ? currentState.Version + 1 : ToVersion()); + return true; + } + + public override void OnEnteringState(VersionSchemeState fromState, VersionSchemeState toState) + { + Debug.Assert(fromState.Phase == VersionSchemeState.REST && toState.Phase == VersionSchemeState.REST); + criticalSection(fromState.Version, toState.Version); + } + + public override void AfterEnteringState(VersionSchemeState state) { } + } + + /// + /// Status for state machine execution + /// + public enum StateMachineExecutionStatus + { + /// + /// execution successful + /// + OK, + /// + /// execution unsuccessful but may be retried + /// + RETRY, + /// + /// execution failed and should not be retried + /// + FAIL + } + +public interface IVersionScheme +{ + /// + /// the current state + VersionSchemeState CurrentState(); + + /// + /// Enter protection on the current thread. During protection, no version transition is possible. For the system + /// to make progress, protection must be later relinquished on the same thread using Leave() or Refresh() + /// + /// the state of the VersionScheme as of protection, which extends until the end of protection + VersionSchemeState Enter(LightEpoch.EpochContext context = null); + + /// + /// Refreshes protection --- equivalent to dropping and immediately reacquiring protection, but more performant. 
+ /// + /// the state of the VersionScheme as of protection, which extends until the end of protection + VersionSchemeState Refresh(LightEpoch.EpochContext context = null); + + /// + /// Drop protection of the current thread + /// + public void Leave(LightEpoch.EpochContext context = null); + + /// + /// Signals to EPVS that a new step is available in the state machine. This is useful when the state machine + /// delays a step (e.g., while waiting on IO to complete) and invoked after the step is available, so the + /// state machine can make progress even without active threads entering and leaving the system. There is no + /// need to invoke this method if steps are always available. + /// + public void SignalStepAvailable(LightEpoch.EpochContext context = null); + + /// + /// Attempt to start executing the given state machine. + /// + /// state machine to execute + /// + /// whether the state machine is successfully started (OK), + /// cannot be started due to an active state machine (RETRY), + /// or cannot be started because the version has advanced past the target version specified (FAIL) + /// + public StateMachineExecutionStatus TryExecuteStateMachine(VersionSchemeStateMachine stateMachine); + + /// + /// Start executing the given state machine. May block forever if called under protection. Instead, use + /// TryExecuteStateMachine if in this situation and relinquish protection between retries. + /// + /// state machine to start + /// whether to spin wait until version transition is complete + /// whether the state machine can be executed. If false, EPVS has advanced version past the target version specified + public bool ExecuteStateMachine(VersionSchemeStateMachine stateMachine, bool spin = false); + + /// + /// Advance the version with a single critical section to the requested version. 
+ /// + /// critical section to execute, with old version and new (target) version as arguments + /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version + /// + /// whether the state machine is successfully started (OK), + /// cannot be started due to an active state machine (RETRY), + /// or cannot be started because the version has advanced past the target version specified (FAIL) + /// + public StateMachineExecutionStatus TryAdvanceVersionWithCriticalSection(Action criticalSection, + long targetVersion = -1); + + /// + /// Advance the version with a single critical section to the requested version. + /// + /// critical section to execute, with old version and new (target) version as arguments + /// version to transition to, or -1 if unconditionally transitioning to an unspecified next version + /// whether to spin wait until version transition is complete + /// whether the state machine can be executed. If false, EPVS has advanced version past the target version specified + public bool AdvanceVersionWithCriticalSection(Action criticalSection, long targetVersion = -1, + bool spin = false); +} + +public abstract class VersionSchemeBase : IVersionScheme +{ + protected VersionSchemeState state = VersionSchemeState.Make(VersionSchemeState.REST, 1); + protected VersionSchemeStateMachine currentMachine = null; + + protected abstract void TryStepStateMachine(VersionSchemeStateMachine expectedMachine = null, LightEpoch.EpochContext context = null); + + public VersionSchemeState CurrentState() => state; + + public abstract VersionSchemeState Enter(LightEpoch.EpochContext context = null); + + public abstract VersionSchemeState Refresh(LightEpoch.EpochContext context = null); + + public abstract void Leave(LightEpoch.EpochContext context = null); + + public abstract void SignalStepAvailable(LightEpoch.EpochContext context = null); + + public StateMachineExecutionStatus TryExecuteStateMachine(VersionSchemeStateMachine 
stateMachine) + { + if (stateMachine.ToVersion() != -1 && stateMachine.ToVersion() <= state.Version) return StateMachineExecutionStatus.FAIL; + var actualStateMachine = Interlocked.CompareExchange(ref currentMachine, stateMachine, null); + if (actualStateMachine == null) + { + // Compute the actual ToVersion of state machine + stateMachine.actualToVersion = + stateMachine.ToVersion() == -1 ? state.Version + 1 : stateMachine.ToVersion(); + // Trigger one initial step to begin the process + TryStepStateMachine(stateMachine); + return StateMachineExecutionStatus.OK; + } + + // Otherwise, need to check that we are not a duplicate attempt to increment version + if (stateMachine.ToVersion() != -1 && actualStateMachine.actualToVersion >= stateMachine.ToVersion()) + return StateMachineExecutionStatus.FAIL; + + return StateMachineExecutionStatus.RETRY; + } + + public bool ExecuteStateMachine(VersionSchemeStateMachine stateMachine, bool spin = false) + { + StateMachineExecutionStatus status; + do + { + status = TryExecuteStateMachine(stateMachine); + } while (status == StateMachineExecutionStatus.RETRY); + + if (status != StateMachineExecutionStatus.OK) return false; + + if (spin) + { + while (state.Version != stateMachine.actualToVersion || state.Phase != VersionSchemeState.REST) + { + TryStepStateMachine(); + Thread.Yield(); + } + } + + return true; + } + + public StateMachineExecutionStatus TryAdvanceVersionWithCriticalSection(Action criticalSection, long targetVersion = -1) + { + return TryExecuteStateMachine(new SimpleVersionSchemeStateMachine(criticalSection, targetVersion)); + } + + public bool AdvanceVersionWithCriticalSection(Action criticalSection, long targetVersion = -1, bool spin = false) + { + return ExecuteStateMachine(new SimpleVersionSchemeStateMachine(criticalSection, targetVersion), spin); + } +} diff --git a/cs/src/core/Epochs/LightEpoch.cs b/cs/src/core/Epochs/LightEpoch.cs index 01ca58c06..645378f8a 100644 --- a/cs/src/core/Epochs/LightEpoch.cs +++ 
b/cs/src/core/Epochs/LightEpoch.cs @@ -17,39 +17,122 @@ public unsafe sealed class LightEpoch /// /// Store thread-static metadata separately; see https://github.com/microsoft/FASTER/pull/746 /// - private class Metadata + public class EpochContext { /// /// Managed thread id of this thread /// - [ThreadStatic] - internal static int threadId; + [ThreadStatic] internal static int threadId; /// /// Start offset to reserve entry in the epoch table /// - [ThreadStatic] - internal static ushort startOffset1; + [ThreadStatic] internal static ushort threadLocalStartOffset1; /// /// Alternate start offset to reserve entry in the epoch table (to reduce probing if slot is already filled) /// - [ThreadStatic] - internal static ushort startOffset2; + [ThreadStatic] internal static ushort threadLocalStartOffset2; /// /// A thread's entry in the epoch table. /// - [ThreadStatic] - internal static int threadEntryIndex; + [ThreadStatic] internal static int threadLocalThreadEntryIndex; /// /// Number of instances using this entry /// - [ThreadStatic] - internal static int threadEntryIndexCount; + [ThreadStatic] internal static int threadLocalEntryIndexCount; + + /// + /// custom id of this epochParticipant, + /// + public int customId; + + internal ushort startOffset1; + internal ushort startOffset2; + internal int threadEntryIndex; + internal int threadEntryIndexCount; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static long GetId(EpochContext context) + { + if (context == null) + return threadId; + // Ensure threadId and customId are complete disjoint + return ((long)context.customId << 32) | 1L; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ushort GetStartOffset1(EpochContext context) => + context?.startOffset1 ?? threadLocalStartOffset1; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ushort GetStartOffset2(EpochContext context) => + context?.startOffset2 ?? 
threadLocalStartOffset2; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetThreadEntryIndex(EpochContext context) => + context?.threadEntryIndex ?? threadLocalThreadEntryIndex; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int GetThreadEntryIndexCount(EpochContext context) => + context?.threadEntryIndexCount ?? threadLocalEntryIndexCount; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetThreadEntryIndex(EpochContext context, int i) + { + if (context == null) + threadLocalThreadEntryIndex = i; + else + context.threadEntryIndex = i; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetStartOffset1(EpochContext context, ushort i) + { + if (context == null) + threadLocalStartOffset1 = i; + else + context.startOffset1 = i; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetStartOffset2(EpochContext context, ushort i) + { + if (context == null) + threadLocalStartOffset2 = i; + else + context.startOffset2 = i; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int IncrementThreadIndexEntryCount(EpochContext context) + { + if (context == null) + return ++threadLocalEntryIndexCount; + return ++context.threadEntryIndexCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int DecrementThreadIndexEntryCount(EpochContext context) + { + if (context == null) + return --threadLocalEntryIndexCount; + return --context.threadEntryIndexCount; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void InvalidateContext(EpochContext context) + { + if (context == null) + threadLocalThreadEntryIndex = 0; + else + context.threadEntryIndex = 0; + } } + /// /// Size of cache line in bytes /// @@ -74,6 +157,7 @@ private class Metadata /// Thread protection status entries. 
/// readonly Entry[] tableRaw; + readonly Entry* tableAligned; #if !NET5_0_OR_GREATER GCHandle tableHandle; @@ -90,6 +174,7 @@ private class Metadata /// Marked volatile to ensure latest value is seen by the last suspended thread. /// volatile int drainCount = 0; + readonly EpochActionPair[] drainList = new EpochActionPair[kDrainListSize]; /// @@ -169,14 +254,15 @@ public void Dispose() /// Check whether current epoch instance is protected on this thread /// /// Result of the check - public bool ThisInstanceProtected() + public bool ThisInstanceProtected(EpochContext context = null) { - int entry = Metadata.threadEntryIndex; + int entry = EpochContext.GetThreadEntryIndex(context); if (kInvalidIndex != entry) { - if ((*(tableAligned + entry)).threadId == entry) + if ((*(tableAligned + entry)).contextId == entry) return true; } + return false; } @@ -185,12 +271,12 @@ public bool ThisInstanceProtected() /// /// Current epoch [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void ProtectAndDrain() + public void ProtectAndDrain(EpochContext context = null) { - int entry = Metadata.threadEntryIndex; + int entry = EpochContext.GetThreadEntryIndex(context); // Protect CurrentEpoch by making an entry for it in the non-static epoch table so ComputeNewSafeToReclaimEpoch() will see it. 
- (*(tableAligned + entry)).threadId = Metadata.threadEntryIndex; + (*(tableAligned + entry)).contextId = entry; (*(tableAligned + entry)).localCurrentEpoch = CurrentEpoch; if (drainCount > 0) @@ -203,29 +289,30 @@ public void ProtectAndDrain() /// Thread suspends its epoch entry /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Suspend() + public void Suspend(EpochContext context = null) { - Release(); - if (drainCount > 0) SuspendDrain(); + Release(context); + if (drainCount > 0) SuspendDrain(context); } /// /// Thread resumes its epoch entry /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Resume() + public void Resume(EpochContext context = null) { - Acquire(); - ProtectAndDrain(); + Acquire(context); + ProtectAndDrain(context); } /// /// Increment global current epoch /// /// - long BumpCurrentEpoch() + long BumpCurrentEpoch(EpochContext context = null) { - Debug.Assert(this.ThisInstanceProtected(), "BumpCurrentEpoch must be called on a protected thread"); + // TODO(Tianyu): Temporarily disabling because DARQ relies on bumping outside of protection + // Debug.Assert(this.ThisInstanceProtected(), "BumpCurrentEpoch must be called on a protected thread"); long nextEpoch = Interlocked.Increment(ref CurrentEpoch); if (drainCount > 0) @@ -240,9 +327,9 @@ long BumpCurrentEpoch() /// /// Trigger action /// - public void BumpCurrentEpoch(Action onDrain) + public void BumpCurrentEpoch(Action onDrain, EpochContext context = null) { - long PriorEpoch = BumpCurrentEpoch() - 1; + long PriorEpoch = BumpCurrentEpoch(context) - 1; int i = 0; while (true) @@ -250,7 +337,8 @@ public void BumpCurrentEpoch(Action onDrain) if (drainList[i].epoch == long.MaxValue) { // This was an empty slot. If it still is, assign this action/epoch to the slot. 
- if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, long.MaxValue) == long.MaxValue) + if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, long.MaxValue) == + long.MaxValue) { drainList[i].action = onDrain; drainList[i].epoch = PriorEpoch; @@ -265,7 +353,8 @@ public void BumpCurrentEpoch(Action onDrain) if (triggerEpoch <= SafeToReclaimEpoch) { // This was a slot with an epoch that was safe to reclaim. If it still is, execute its trigger, then assign this action/epoch to the slot. - if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, triggerEpoch) == triggerEpoch) + if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, triggerEpoch) == + triggerEpoch) { var triggerAction = drainList[i].action; drainList[i].action = onDrain; @@ -279,14 +368,14 @@ public void BumpCurrentEpoch(Action onDrain) if (++i == kDrainListSize) { // We are at the end of the drain list and found no empty or reclaimable slot. ProtectAndDrain, which should clear one or more slots. - ProtectAndDrain(); + ProtectAndDrain(context); i = 0; Thread.Yield(); } } // Now ProtectAndDrain, which may execute the action we just added. 
- ProtectAndDrain(); + ProtectAndDrain(context); } /// @@ -297,10 +386,10 @@ public void BumpCurrentEpoch(Action onDrain) /// Version /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Mark(int markerIdx, long version) + public void Mark(int markerIdx, long version, EpochContext context = null) { Debug.Assert(markerIdx < 6); - (*(tableAligned + Metadata.threadEntryIndex)).markers[markerIdx] = version; + (*(tableAligned + EpochContext.GetThreadEntryIndex(context))).markers[markerIdx] = version; } /// @@ -328,6 +417,7 @@ public bool CheckIsComplete(int markerIdx, long version) } } } + return true; } @@ -361,7 +451,7 @@ long ComputeNewSafeToReclaimEpoch(long currentEpoch) /// Take care of pending drains after epoch suspend /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - void SuspendDrain() + void SuspendDrain(EpochContext context) { while (drainCount > 0) { @@ -376,8 +466,9 @@ void SuspendDrain() return; } } - Resume(); - Release(); + + Resume(context); + Release(context); } } @@ -396,7 +487,8 @@ void Drain(long nextEpoch) if (trigger_epoch <= SafeToReclaimEpoch) { - if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, trigger_epoch) == trigger_epoch) + if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, trigger_epoch) == + trigger_epoch) { // Store off the trigger action, then set epoch to int.MaxValue to mark this slot as "available for use". 
var trigger_action = drainList[i].action; @@ -416,39 +508,38 @@ void Drain(long nextEpoch) /// Thread acquires its epoch entry /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - void Acquire() + void Acquire(EpochContext context) { - if (Metadata.threadEntryIndex == kInvalidIndex) - Metadata.threadEntryIndex = ReserveEntryForThread(); + if (EpochContext.GetThreadEntryIndex(context) == kInvalidIndex) + ReserveEntryForThread(context); - Debug.Assert((*(tableAligned + Metadata.threadEntryIndex)).localCurrentEpoch == 0, + Debug.Assert((*(tableAligned + EpochContext.GetThreadEntryIndex(context))).localCurrentEpoch == 0, "Trying to acquire protected epoch. Make sure you do not re-enter FASTER from callbacks or IDevice implementations. If using tasks, use TaskCreationOptions.RunContinuationsAsynchronously."); // This corresponds to AnyInstanceProtected(). We do not mark "ThisInstanceProtected" until ProtectAndDrain(). - Metadata.threadEntryIndexCount++; + EpochContext.IncrementThreadIndexEntryCount(context); } /// /// Thread releases its epoch entry /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - void Release() + void Release(EpochContext context) { - int entry = Metadata.threadEntryIndex; + int entry = EpochContext.GetThreadEntryIndex(context); Debug.Assert((*(tableAligned + entry)).localCurrentEpoch != 0, "Trying to release unprotected epoch. Make sure you do not re-enter FASTER from callbacks or IDevice implementations. 
If using tasks, use TaskCreationOptions.RunContinuationsAsynchronously."); // Clear "ThisInstanceProtected()" (non-static epoch table) (*(tableAligned + entry)).localCurrentEpoch = 0; - (*(tableAligned + entry)).threadId = 0; + (*(tableAligned + entry)).contextId = 0; // Decrement "AnyInstanceProtected()" (static thread table) - Metadata.threadEntryIndexCount--; - if (Metadata.threadEntryIndexCount == 0) + if (EpochContext.DecrementThreadIndexEntryCount(context) == 0) { - (threadIndexAligned + Metadata.threadEntryIndex)->threadId = 0; - Metadata.threadEntryIndex = kInvalidIndex; + (threadIndexAligned + EpochContext.GetThreadEntryIndex(context))->contextId = 0; + EpochContext.InvalidateContext(context); } } @@ -457,48 +548,67 @@ void Release() /// thread will ever have ID 0. /// /// Reserved entry - static int ReserveEntry() + static void ReserveEntry(EpochContext context) { while (true) { // Try to acquire entry - if (0 == (threadIndexAligned + Metadata.startOffset1)->threadId) + if (0 == (threadIndexAligned + EpochContext.GetStartOffset1(context))->contextId) { if (0 == Interlocked.CompareExchange( - ref (threadIndexAligned + Metadata.startOffset1)->threadId, - Metadata.threadId, 0)) - return Metadata.startOffset1; + ref (threadIndexAligned + EpochContext.GetStartOffset1(context))->contextId, + EpochContext.GetId(context), 0)) + { + EpochContext.SetThreadEntryIndex(context, EpochContext.GetStartOffset1(context)); + return; + } } - if (Metadata.startOffset2 > 0) + if (EpochContext.GetStartOffset2(context) > 0) { // Try alternate entry - Metadata.startOffset1 = Metadata.startOffset2; - Metadata.startOffset2 = 0; + EpochContext.SetStartOffset1(context, EpochContext.GetStartOffset2(context)); + EpochContext.SetStartOffset2(context, 0); } - else Metadata.startOffset1++; // Probe next sequential entry - if (Metadata.startOffset1 > kTableSize) + else + { + // Probe next sequential entry + EpochContext.SetStartOffset1(context, (ushort) 
(EpochContext.GetStartOffset1(context) + 1)); + } + + if (EpochContext.GetStartOffset1(context)> kTableSize) { - Metadata.startOffset1 -= kTableSize; + EpochContext.SetStartOffset1(context, (ushort)(EpochContext.GetStartOffset1(context) - kTableSize)); Thread.Yield(); } } } - /// - /// A 32-bit murmur3 implementation. - /// - /// - /// - static int Murmur3(int h) + // TODO(Tianyu): This is GPT-generated so double check before merging + static int Murmur3Long(long value) { - uint a = (uint)h; - a ^= a >> 16; - a *= 0x85ebca6b; - a ^= a >> 13; - a *= 0xc2b2ae35; - a ^= a >> 16; - return (int)a; + ulong h = (ulong)value; + + // Mixing initial bits + h ^= h >> 33; + h *= 0xff51afd7ed558ccdUL; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53UL; + h ^= h >> 33; + + // Since we need to return an int, and our hash is 64 bits, we mix down to 32 bits. + uint high = (uint)(h >> 32); + uint low = (uint)h; + uint mixed = high ^ low; + + // Final mix functions similar to the int version + mixed ^= mixed >> 16; + mixed *= 0x85ebca6b; + mixed ^= mixed >> 13; + mixed *= 0xc2b2ae35; + mixed ^= mixed >> 16; + + return (int)mixed; } /// @@ -506,16 +616,23 @@ static int Murmur3(int h) /// once for a thread. 
/// /// Reserved entry - static int ReserveEntryForThread() + static void ReserveEntryForThread(EpochContext context) { - if (Metadata.threadId == 0) // run once per thread for performance + if (context == null && EpochContext.threadId == 0) + { + EpochContext.threadId = Environment.CurrentManagedThreadId; + uint code = (uint)Murmur3Long(EpochContext.GetId(context)); + EpochContext.threadLocalStartOffset1 = (ushort)(1 + (code % kTableSize)); + EpochContext.threadLocalStartOffset2 = (ushort)(1 + ((code >> 16) % kTableSize)); + } + else if (context != null) { - Metadata.threadId = Environment.CurrentManagedThreadId; - uint code = (uint)Murmur3(Metadata.threadId); - Metadata.startOffset1 = (ushort)(1 + (code % kTableSize)); - Metadata.startOffset2 = (ushort)(1 + ((code >> 16) % kTableSize)); + uint code = (uint)Murmur3Long(EpochContext.GetId(context)); + context.startOffset1 = (ushort)(1 + (code % kTableSize)); + context.startOffset2 = (ushort)(1 + ((code >> 16) % kTableSize)); } - return ReserveEntry(); + + ReserveEntry(context); } /// @@ -527,29 +644,25 @@ struct Entry /// /// Thread-local value of epoch /// - [FieldOffset(0)] - public long localCurrentEpoch; + [FieldOffset(0)] public long localCurrentEpoch; /// /// ID of thread associated with this entry. /// - [FieldOffset(8)] - public int threadId; - - [FieldOffset(12)] - public int reentrant; + [FieldOffset(8)] public long contextId; - [FieldOffset(16)] - public fixed long markers[6]; + [FieldOffset(16)] public fixed long markers[6]; - public override string ToString() => $"lce = {localCurrentEpoch}, tid = {threadId}, re-ent {reentrant}"; + public override string ToString() => $"lce = {localCurrentEpoch}, tid = {contextId}"; } + struct EpochActionPair { public long epoch; public Action action; - public override string ToString() => $"epoch = {epoch}, action = {(action is null ? "n/a" : action.Method.ToString())}"; + public override string ToString() => + $"epoch = {epoch}, action = {(action is null ? 
"n/a" : action.Method.ToString())}"; } } -} +} \ No newline at end of file diff --git a/cs/src/core/Epochs/LightEpochOld.cs b/cs/src/core/Epochs/LightEpochOld.cs new file mode 100644 index 000000000..903cdcf7c --- /dev/null +++ b/cs/src/core/Epochs/LightEpochOld.cs @@ -0,0 +1,555 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Threading; +using System.Runtime.InteropServices; +using System.Runtime.CompilerServices; +using System.Diagnostics; + +namespace FASTER.core +{ + /// + /// Epoch protection + /// + public unsafe sealed class LightEpochOld + { + /// + /// Store thread-static metadata separately; see https://github.com/microsoft/FASTER/pull/746 + /// + private class Metadata + { + /// + /// Managed thread id of this thread + /// + [ThreadStatic] + internal static int threadId; + + /// + /// Start offset to reserve entry in the epoch table + /// + [ThreadStatic] + internal static ushort startOffset1; + + /// + /// Alternate start offset to reserve entry in the epoch table (to reduce probing if slot is already filled) + /// + [ThreadStatic] + internal static ushort startOffset2; + + /// + /// A thread's entry in the epoch table. + /// + [ThreadStatic] + internal static int threadEntryIndex; + + /// + /// Number of instances using this entry + /// + [ThreadStatic] + internal static int threadEntryIndexCount; + } + + /// + /// Size of cache line in bytes + /// + const int kCacheLineBytes = 64; + + /// + /// Default invalid index entry. + /// + const int kInvalidIndex = 0; + + /// + /// Default number of entries in the entries table + /// + static readonly ushort kTableSize = Math.Max((ushort)128, (ushort)(Environment.ProcessorCount * 2)); + + /// + /// Default drainlist size + /// + const int kDrainListSize = 16; + + /// + /// Thread protection status entries. 
+ /// + readonly Entry[] tableRaw; + readonly Entry* tableAligned; +#if !NET5_0_OR_GREATER + GCHandle tableHandle; +#endif + + static readonly Entry[] threadIndex; + static readonly Entry* threadIndexAligned; +#if !NET5_0_OR_GREATER + static GCHandle threadIndexHandle; +#endif + + /// + /// List of action, epoch pairs containing actions to be performed when an epoch becomes safe to reclaim. + /// Marked volatile to ensure latest value is seen by the last suspended thread. + /// + volatile int drainCount = 0; + readonly EpochActionPair[] drainList = new EpochActionPair[kDrainListSize]; + + /// + /// Global current epoch value + /// + long CurrentEpoch; + + /// + /// Cached value of latest epoch that is safe to reclaim + /// + long SafeToReclaimEpoch; + + /// + /// Static constructor to setup shared cache-aligned space + /// to store per-entry count of instances using that entry + /// + static LightEpochOld() + { + long p; + + // Over-allocate to do cache-line alignment +#if NET5_0_OR_GREATER + threadIndex = GC.AllocateArray(kTableSize + 2, true); + p = (long)Unsafe.AsPointer(ref threadIndex[0]); +#else + threadIndex = new Entry[kTableSize + 2]; + threadIndexHandle = GCHandle.Alloc(threadIndex, GCHandleType.Pinned); + p = (long)threadIndexHandle.AddrOfPinnedObject(); +#endif + // Force the pointer to align to 64-byte boundaries + long p2 = (p + (kCacheLineBytes - 1)) & ~(kCacheLineBytes - 1); + threadIndexAligned = (Entry*)p2; + } + + /// + /// Instantiate the epoch table + /// + public LightEpochOld() + { + long p; + +#if NET5_0_OR_GREATER + tableRaw = GC.AllocateArray(kTableSize + 2, true); + p = (long)Unsafe.AsPointer(ref tableRaw[0]); +#else + // Over-allocate to do cache-line alignment + tableRaw = new Entry[kTableSize + 2]; + tableHandle = GCHandle.Alloc(tableRaw, GCHandleType.Pinned); + p = (long)tableHandle.AddrOfPinnedObject(); +#endif + // Force the pointer to align to 64-byte boundaries + long p2 = (p + (kCacheLineBytes - 1)) & ~(kCacheLineBytes - 1); + 
tableAligned = (Entry*)p2; + + CurrentEpoch = 1; + SafeToReclaimEpoch = 0; + + // Mark all epoch table entries as "available" + for (int i = 0; i < kDrainListSize; i++) + drainList[i].epoch = long.MaxValue; + drainCount = 0; + } + + /// + /// Clean up epoch table + /// + public void Dispose() + { +#if !NET5_0_OR_GREATER + tableHandle.Free(); +#endif + CurrentEpoch = 1; + SafeToReclaimEpoch = 0; + } + + /// + /// Check whether current epoch instance is protected on this thread + /// + /// Result of the check + public bool ThisInstanceProtected() + { + int entry = Metadata.threadEntryIndex; + if (kInvalidIndex != entry) + { + if ((*(tableAligned + entry)).threadId == entry) + return true; + } + return false; + } + + /// + /// Enter the thread into the protected code region + /// + /// Current epoch + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void ProtectAndDrain() + { + int entry = Metadata.threadEntryIndex; + + // Protect CurrentEpoch by making an entry for it in the non-static epoch table so ComputeNewSafeToReclaimEpoch() will see it. 
+ (*(tableAligned + entry)).threadId = Metadata.threadEntryIndex; + (*(tableAligned + entry)).localCurrentEpoch = CurrentEpoch; + + if (drainCount > 0) + { + Drain((*(tableAligned + entry)).localCurrentEpoch); + } + } + + /// + /// Thread suspends its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Suspend() + { + Release(); + if (drainCount > 0) SuspendDrain(); + } + + /// + /// Thread resumes its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Resume() + { + Acquire(); + ProtectAndDrain(); + } + + /// + /// Increment global current epoch + /// + /// + long BumpCurrentEpoch() + { + Debug.Assert(this.ThisInstanceProtected(), "BumpCurrentEpoch must be called on a protected thread"); + long nextEpoch = Interlocked.Increment(ref CurrentEpoch); + + if (drainCount > 0) + Drain(nextEpoch); + + return nextEpoch; + } + + /// + /// Increment current epoch and associate trigger action + /// with the prior epoch + /// + /// Trigger action + /// + public void BumpCurrentEpoch(Action onDrain) + { + long PriorEpoch = BumpCurrentEpoch() - 1; + + int i = 0; + while (true) + { + if (drainList[i].epoch == long.MaxValue) + { + // This was an empty slot. If it still is, assign this action/epoch to the slot. + if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, long.MaxValue) == long.MaxValue) + { + drainList[i].action = onDrain; + drainList[i].epoch = PriorEpoch; + Interlocked.Increment(ref drainCount); + break; + } + } + else + { + var triggerEpoch = drainList[i].epoch; + + if (triggerEpoch <= SafeToReclaimEpoch) + { + // This was a slot with an epoch that was safe to reclaim. If it still is, execute its trigger, then assign this action/epoch to the slot. 
+ if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, triggerEpoch) == triggerEpoch) + { + var triggerAction = drainList[i].action; + drainList[i].action = onDrain; + drainList[i].epoch = PriorEpoch; + triggerAction(); + break; + } + } + } + + if (++i == kDrainListSize) + { + // We are at the end of the drain list and found no empty or reclaimable slot. ProtectAndDrain, which should clear one or more slots. + ProtectAndDrain(); + i = 0; + Thread.Yield(); + } + } + + // Now ProtectAndDrain, which may execute the action we just added. + ProtectAndDrain(); + } + + /// + /// Mechanism for threads to mark some activity as completed until + /// some version by this thread + /// + /// ID of activity + /// Version + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Mark(int markerIdx, long version) + { + Debug.Assert(markerIdx < 6); + (*(tableAligned + Metadata.threadEntryIndex)).markers[markerIdx] = version; + } + + /// + /// Check if all active threads have completed the some + /// activity until given version. 
+ /// + /// ID of activity + /// Version + /// Whether complete + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool CheckIsComplete(int markerIdx, long version) + { + Debug.Assert(markerIdx < 6); + + // check if all threads have reported complete + for (int index = 1; index <= kTableSize; ++index) + { + long entry_epoch = (*(tableAligned + index)).localCurrentEpoch; + long fc_version = (*(tableAligned + index)).markers[markerIdx]; + if (0 != entry_epoch) + { + if ((fc_version != version) && (entry_epoch < long.MaxValue)) + { + return false; + } + } + } + return true; + } + + /// + /// Looks at all threads and return the latest safe epoch + /// + /// Current epoch + /// Safe epoch + long ComputeNewSafeToReclaimEpoch(long currentEpoch) + { + long oldestOngoingCall = currentEpoch; + + for (int index = 1; index <= kTableSize; ++index) + { + long entry_epoch = (*(tableAligned + index)).localCurrentEpoch; + if (0 != entry_epoch) + { + if (entry_epoch < oldestOngoingCall) + { + oldestOngoingCall = entry_epoch; + } + } + } + + // The latest safe epoch is the one just before the earliest unsafe epoch. + SafeToReclaimEpoch = oldestOngoingCall - 1; + return SafeToReclaimEpoch; + } + + /// + /// Take care of pending drains after epoch suspend + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + void SuspendDrain() + { + while (drainCount > 0) + { + // Barrier ensures we see the latest epoch table entries. Ensures + // that the last suspended thread drains all pending actions. 
+ Thread.MemoryBarrier(); + for (int index = 1; index <= kTableSize; ++index) + { + long entry_epoch = (*(tableAligned + index)).localCurrentEpoch; + if (0 != entry_epoch) + { + return; + } + } + Resume(); + Release(); + } + } + + /// + /// Check and invoke trigger actions that are ready + /// + /// Next epoch + [MethodImpl(MethodImplOptions.AggressiveInlining)] + void Drain(long nextEpoch) + { + ComputeNewSafeToReclaimEpoch(nextEpoch); + + for (int i = 0; i < kDrainListSize; i++) + { + var trigger_epoch = drainList[i].epoch; + + if (trigger_epoch <= SafeToReclaimEpoch) + { + if (Interlocked.CompareExchange(ref drainList[i].epoch, long.MaxValue - 1, trigger_epoch) == trigger_epoch) + { + // Store off the trigger action, then set epoch to int.MaxValue to mark this slot as "available for use". + var trigger_action = drainList[i].action; + drainList[i].action = null; + drainList[i].epoch = long.MaxValue; + Interlocked.Decrement(ref drainCount); + + // Execute the action + trigger_action(); + if (drainCount == 0) break; + } + } + } + } + + /// + /// Thread acquires its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + void Acquire() + { + if (Metadata.threadEntryIndex == kInvalidIndex) + Metadata.threadEntryIndex = ReserveEntryForThread(); + + Debug.Assert((*(tableAligned + Metadata.threadEntryIndex)).localCurrentEpoch == 0, + "Trying to acquire protected epoch. Make sure you do not re-enter FASTER from callbacks or IDevice implementations. If using tasks, use TaskCreationOptions.RunContinuationsAsynchronously."); + + // This corresponds to AnyInstanceProtected(). We do not mark "ThisInstanceProtected" until ProtectAndDrain(). + Metadata.threadEntryIndexCount++; + } + + /// + /// Thread releases its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + void Release() + { + int entry = Metadata.threadEntryIndex; + + Debug.Assert((*(tableAligned + entry)).localCurrentEpoch != 0, + "Trying to release unprotected epoch. 
Make sure you do not re-enter FASTER from callbacks or IDevice implementations. If using tasks, use TaskCreationOptions.RunContinuationsAsynchronously."); + + // Clear "ThisInstanceProtected()" (non-static epoch table) + (*(tableAligned + entry)).localCurrentEpoch = 0; + (*(tableAligned + entry)).threadId = 0; + + // Decrement "AnyInstanceProtected()" (static thread table) + Metadata.threadEntryIndexCount--; + if (Metadata.threadEntryIndexCount == 0) + { + (threadIndexAligned + Metadata.threadEntryIndex)->threadId = 0; + Metadata.threadEntryIndex = kInvalidIndex; + } + } + + /// + /// Reserve entry for thread. This method relies on the fact that no + /// thread will ever have ID 0. + /// + /// Reserved entry + static int ReserveEntry() + { + while (true) + { + // Try to acquire entry + if (0 == (threadIndexAligned + Metadata.startOffset1)->threadId) + { + if (0 == Interlocked.CompareExchange( + ref (threadIndexAligned + Metadata.startOffset1)->threadId, + Metadata.threadId, 0)) + return Metadata.startOffset1; + } + + if (Metadata.startOffset2 > 0) + { + // Try alternate entry + Metadata.startOffset1 = Metadata.startOffset2; + Metadata.startOffset2 = 0; + } + else Metadata.startOffset1++; // Probe next sequential entry + if (Metadata.startOffset1 > kTableSize) + { + Metadata.startOffset1 -= kTableSize; + Thread.Yield(); + } + } + } + + /// + /// A 32-bit murmur3 implementation. + /// + /// + /// + static int Murmur3(int h) + { + uint a = (uint)h; + a ^= a >> 16; + a *= 0x85ebca6b; + a ^= a >> 13; + a *= 0xc2b2ae35; + a ^= a >> 16; + return (int)a; + } + + /// + /// Allocate a new entry in epoch table. This is called + /// once for a thread. 
+ /// + /// Reserved entry + static int ReserveEntryForThread() + { + if (Metadata.threadId == 0) // run once per thread for performance + { + Metadata.threadId = Environment.CurrentManagedThreadId; + uint code = (uint)Murmur3(Metadata.threadId); + Metadata.startOffset1 = (ushort)(1 + (code % kTableSize)); + Metadata.startOffset2 = (ushort)(1 + ((code >> 16) % kTableSize)); + } + return ReserveEntry(); + } + + /// + /// Epoch table entry (cache line size). + /// + [StructLayout(LayoutKind.Explicit, Size = kCacheLineBytes)] + struct Entry + { + /// + /// Thread-local value of epoch + /// + [FieldOffset(0)] + public long localCurrentEpoch; + + /// + /// ID of thread associated with this entry. + /// + [FieldOffset(8)] + public int threadId; + + [FieldOffset(12)] + public int reentrant; + + [FieldOffset(16)] + public fixed long markers[6]; + + public override string ToString() => $"lce = {localCurrentEpoch}, tid = {threadId}, re-ent {reentrant}"; + } + struct EpochActionPair + { + public long epoch; + public Action action; + + public override string ToString() => $"epoch = {epoch}, action = {(action is null ? 
"n/a" : action.Method.ToString())}"; + } + } +} \ No newline at end of file diff --git a/cs/src/core/Epochs/RwLatchVersionScheme.cs b/cs/src/core/Epochs/RwLatchVersionScheme.cs new file mode 100644 index 000000000..ac49e0271 --- /dev/null +++ b/cs/src/core/Epochs/RwLatchVersionScheme.cs @@ -0,0 +1,128 @@ +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core; + +public class NonThreadBasedReaderWriterLatch +{ + private volatile int readerCount = 0; + private volatile int writerPending = 0; + + public bool HasActiveThreads() => readerCount != 0 || writerPending != 0; + + public void EnterReadLock() + { + while (true) + { + while (writerPending != 0) Thread.Yield(); + Interlocked.Increment(ref readerCount); + if (writerPending == 0) return; + Interlocked.Decrement(ref readerCount); + } + } + + public void ExitReadLock() + { + var ret = Interlocked.Decrement(ref readerCount); + Debug.Assert(ret >= 0); + } + + public void EnterWriteLock() + { + while (Interlocked.CompareExchange(ref writerPending, 1, 0) != 0) Thread.Yield(); + while (readerCount != 0) Thread.Yield(); + } + + public void ExitWriteLock() + { + writerPending = 0; + } +} + +public class RwLatchVersionScheme : VersionSchemeBase +{ + private NonThreadBasedReaderWriterLatch rwLatch = new(); + + public override void SignalStepAvailable(LightEpoch.EpochContext context = null) + { + if (!rwLatch.HasActiveThreads()) + TryStepStateMachine(); + } + + // Atomic transition from expectedState -> nextState + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool MakeTransition(VersionSchemeState expectedState, VersionSchemeState nextState) + { + if (Interlocked.CompareExchange(ref state.Word, nextState.Word, expectedState.Word) != expectedState.Word) + return false; + Debug.WriteLine("Moved to {0}, {1}", nextState.Phase, nextState.Version); + return true; + } + + protected override void TryStepStateMachine(VersionSchemeStateMachine expectedMachine 
= null, LightEpoch.EpochContext context = null) + { + var machineLocal = currentMachine; + var oldState = state; + + // Nothing to step + if (machineLocal == null) return; + + // Should exit to avoid stepping infinitely (until stack overflow) + if (expectedMachine != null && machineLocal != expectedMachine) return; + + // Still computing actual to version + if (machineLocal.actualToVersion == -1) return; + + // Machine finished, but not reset yet. Should reset and avoid starting another cycle + if (oldState.Phase == VersionSchemeState.REST && oldState.Version == machineLocal.actualToVersion) + { + Interlocked.CompareExchange(ref currentMachine, null, machineLocal); + return; + } + + // Step is in progress or no step is available + if (oldState.IsIntermediate() || !machineLocal.GetNextStep(oldState, out var nextState)) return; + + var intermediate = VersionSchemeState.MakeIntermediate(oldState); + if (!MakeTransition(oldState, intermediate)) return; + + rwLatch.EnterWriteLock(); + machineLocal.OnEnteringState(oldState, nextState); + var success = MakeTransition(VersionSchemeState.MakeIntermediate(oldState), nextState); + Debug.Assert(success); + machineLocal.AfterEnteringState(nextState); + rwLatch.ExitWriteLock(); + TryStepStateMachine(machineLocal); + } + + public override VersionSchemeState Enter(LightEpoch.EpochContext context = null) + { + TryStepStateMachine(); + rwLatch.EnterReadLock(); + + VersionSchemeState result; + while (true) + { + result = state; + if (!result.IsIntermediate()) break; + rwLatch.ExitReadLock(); + Thread.Yield(); + rwLatch.EnterReadLock(); + } + + return result; + } + + public override VersionSchemeState Refresh(LightEpoch.EpochContext context = null) + { + Leave(); + return Enter(); + } + + public override void Leave(LightEpoch.EpochContext context = null) + { + rwLatch.ExitReadLock(); + TryStepStateMachine(); + } +} \ No newline at end of file diff --git a/cs/src/core/FasterLog/FasterLog.cs b/cs/src/core/FasterLog/FasterLog.cs index 
82f444149..3883cd979 100644 --- a/cs/src/core/FasterLog/FasterLog.cs +++ b/cs/src/core/FasterLog/FasterLog.cs @@ -44,7 +44,6 @@ public sealed class FasterLog : IDisposable long commitNum, commitCoveredAddress; readonly LogCommitPolicy commitPolicy; - /// /// Beginning address of log /// @@ -326,13 +325,11 @@ public void CompleteLog(bool spinWait = false) bool isProtected = epoch.ThisInstanceProtected(); if (!isProtected) epoch.Resume(); + var otherEntriesDone = new ManualResetEventSlim(); try { // Ensure all currently started entries will enqueue before we declare log closed - epoch.BumpCurrentEpoch(() => - { - CommitInternal(out _, out _, false, Array.Empty(), long.MaxValue, null); - }); + epoch.BumpCurrentEpoch(() => {}); } finally { @@ -340,6 +337,7 @@ public void CompleteLog(bool spinWait = false) epoch.Suspend(); } + CommitInternal(out _, out _, false, Array.Empty(), long.MaxValue, null); if (spinWait) WaitForCommit(TailAddress, long.MaxValue); } @@ -450,33 +448,71 @@ public long Enqueue(IReadOnlySpanBatch readOnlySpanBatch) return logicalAddress; } - /// - /// Enqueue batch of entries to log (in memory) - no guarantee of flush/commit - /// - /// Entry to be enqueued to log - /// type of entry - /// Logical address of added entry - public long Enqueue(T entry) where T : ILogEnqueueEntry + public unsafe bool TryEnqueue(IReadOnlySpanBatch readOnlySpanBatch, Action criticalSection) { - long logicalAddress; - while (!TryEnqueue(entry, out logicalAddress)) + int totalEntries = readOnlySpanBatch.TotalEntries(); + var allocatedLength = 0; + for (int i = 0; i < totalEntries; i++) + { + allocatedLength += Align(readOnlySpanBatch.Get(i).Length) + headerSize; + } + + ValidateAllocatedLength(allocatedLength); + + epoch.Resume(); + if (commitNum == long.MaxValue) throw new FasterException("Attempting to enqueue into a completed log"); + + var logicalAddress = allocator.TryAllocateRetryNow(allocatedLength); + + if (logicalAddress == 0) + { + epoch.Suspend(); + if 
(cannedException != null) throw cannedException; + return false; + } + + var physicalAddress = allocator.GetPhysicalAddress(logicalAddress); + var currentLogicalAddress = logicalAddress; + for (int i = 0; i < totalEntries; i++) + { + var span = readOnlySpanBatch.Get(i); + var entryLength = span.Length; + fixed (byte* bp = &span.GetPinnableReference()) + Buffer.MemoryCopy(bp, (void*) (headerSize + physicalAddress), entryLength, entryLength); + SetHeader(entryLength, (byte*) physicalAddress); + var usedSpace = Align(entryLength) + headerSize; + + criticalSection?.Invoke(readOnlySpanBatch, i, currentLogicalAddress); + currentLogicalAddress += usedSpace; + physicalAddress += usedSpace; + } + + if (AutoRefreshSafeTailAddress) DoAutoRefreshSafeTailAddress(); + + epoch.Suspend(); + return true; + } + + public void Enqueue(IReadOnlySpanBatch batch, Action criticalSection) + { + while (!TryEnqueue(batch, criticalSection)) Thread.Yield(); - return logicalAddress; } - + /// /// Enqueue batch of entries to log (in memory) - no guarantee of flush/commit /// - /// Batch of entries to be enqueued to log + /// Entry to be enqueued to log /// type of entry /// Logical address of added entry - public long Enqueue(IEnumerable entries) where T : ILogEnqueueEntry + public long Enqueue(T entry, Action criticalSection) where T : ILogEnqueueEntry { long logicalAddress; - while (!TryEnqueue(entries, out logicalAddress)) + while (!TryEnqueue(entry, out logicalAddress, criticalSection)) Thread.Yield(); return logicalAddress; } + #endregion #region TryEnqueue @@ -488,7 +524,7 @@ public long Enqueue(IEnumerable entries) where T : ILogEnqueueEntry /// Logical address of added entry /// type of entry /// Whether the append succeeded - public unsafe bool TryEnqueue(T entry, out long logicalAddress) where T : ILogEnqueueEntry + public unsafe bool TryEnqueue(T entry, out long logicalAddress, Action criticalSection) where T : ILogEnqueueEntry { logicalAddress = 0; var length = 
entry.SerializedLength; @@ -511,6 +547,7 @@ public unsafe bool TryEnqueue(T entry, out long logicalAddress) where T : ILo var physicalAddress = allocator.GetPhysicalAddress(logicalAddress); entry.SerializeTo(new Span((void*)(headerSize + physicalAddress), length)); SetHeader(length, (byte*)physicalAddress); + criticalSection?.Invoke(entry, logicalAddress); if (AutoRefreshSafeTailAddress) DoAutoRefreshSafeTailAddress(); epoch.Suspend(); if (AutoCommit) Commit(); @@ -1087,7 +1124,7 @@ private static async ValueTask SlowEnqueueAsync(FasterLog @this, IReadOnly public ValueTask EnqueueAsync(T entry, CancellationToken token = default) where T : ILogEnqueueEntry { token.ThrowIfCancellationRequested(); - if (TryEnqueue(entry, out long logicalAddress)) + if (TryEnqueue(entry, out long logicalAddress, null)) return new ValueTask(logicalAddress); return SlowEnqueueAsync(this, entry, token); @@ -1100,7 +1137,7 @@ private static async ValueTask SlowEnqueueAsync(FasterLog @this, T entr while (true) { var flushEvent = @this.FlushEvent; - if (@this.TryEnqueue(entry, out logicalAddress)) + if (@this.TryEnqueue(entry, out logicalAddress, null)) break; // Wait for *some* flush - failure can be ignored except if the token was signaled (which the caller should handle correctly) try @@ -1455,7 +1492,7 @@ public long EnqueueAndWaitForCommit(IReadOnlySpanBatch readOnlySpanBatch) public long EnqueueAndWaitForCommit(T entry) where T : ILogEnqueueEntry { long logicalAddress; - while (!TryEnqueue(entry, out logicalAddress)) + while (!TryEnqueue(entry, out logicalAddress, null)) Thread.Yield(); WaitForCommit(logicalAddress + 1); return logicalAddress; @@ -1657,7 +1694,7 @@ public async ValueTask EnqueueAndWaitForCommitAsync(T entry, Cancellati { flushEvent = FlushEvent; commitTask = CommitTask; - if (TryEnqueue(entry, out logicalAddress)) + if (TryEnqueue(entry, out logicalAddress, null)) break; try { diff --git a/cs/src/core/Index/CheckpointManagement/FileBasedCheckpointManager.cs 
b/cs/src/core/Index/CheckpointManagement/FileBasedCheckpointManager.cs new file mode 100644 index 000000000..4d6f6f440 --- /dev/null +++ b/cs/src/core/Index/CheckpointManagement/FileBasedCheckpointManager.cs @@ -0,0 +1,418 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Microsoft.Extensions.Logging; + +namespace FASTER.core; + +public class FileBasedCheckpointManager : ILogCommitManager, ICheckpointManager +{ + const byte indexTokenCount = 2; + const byte logTokenCount = 1; + const byte flogCommitCount = 1; + + protected readonly ICheckpointNamingScheme checkpointNamingScheme; + protected readonly INamedDeviceFactory deviceFactory; + private readonly bool removeOutdated; + + /// + /// Track historical commits for automatic purging + /// + private readonly Guid[] indexTokenHistory, logTokenHistory; + + private readonly long[] flogCommitHistory; + private byte indexTokenHistoryOffset, logTokenHistoryOffset, flogCommitHistoryOffset; + + readonly ILogger logger; + readonly WorkQueueFIFO deleteQueue; + readonly int fastCommitThrottleFreq; + int commitCount; + + public FileBasedCheckpointManager(INamedDeviceFactory deviceFactory, ICheckpointNamingScheme checkpointNamingScheme, bool removeOutdated = true, + int fastCommitThrottleFreq = 0, ILogger logger = null) + { + this.logger = logger; + this.checkpointNamingScheme = checkpointNamingScheme; + this.fastCommitThrottleFreq = fastCommitThrottleFreq; + this.deviceFactory = deviceFactory; + + this.removeOutdated = removeOutdated; + if (removeOutdated) + { + deleteQueue = new WorkQueueFIFO(prior => DeleteIfExists(checkpointNamingScheme.FasterLogCommitMetadata(prior))); + + // We keep two index checkpoints as the latest index might not have a + // later log checkpoint to work with + indexTokenHistory = new Guid[indexTokenCount]; + // We only keep the latest log checkpoint + logTokenHistory = new Guid[logTokenCount]; + // // We only keep the latest FasterLog commit + 
flogCommitHistory = new long[flogCommitCount];
+        }
+    }
+
+    private void DeleteIfExists(FileDescriptor descriptor)
+    {
+        if (descriptor.fileName != null)
+        {
+            var file = new FileInfo(Path.Combine(this.checkpointNamingScheme.BaseName(), descriptor.directoryName,
+                descriptor.fileName));
+            if (file.Exists) file.Delete();
+        }
+        else
+        {
+            var dir = new DirectoryInfo(Path.Combine(this.checkpointNamingScheme.BaseName(), descriptor.directoryName));
+            if (dir.Exists) dir.Delete(true);
+
+        }
+    }
+
+    private FileStream GetFile(FileDescriptor fd)
+    {
+        var filename = Path.Combine(checkpointNamingScheme.BaseName(), fd.directoryName, fd.fileName);
+        var path = new FileInfo(filename).Directory.FullName;
+        if (!Directory.Exists(path)) Directory.CreateDirectory(path);
+        return new FileStream(filename, FileMode.OpenOrCreate, FileAccess.ReadWrite);
+    }
+
+    private byte[] ReadFileBodyFully(string path)
+    {
+        using var fs = new FileStream(path, FileMode.Open, FileAccess.Read);
+        using var reader = new BinaryReader(fs);
+        var size = reader.ReadInt32();
+        var result = new byte[size];
+        for (var read = 0; read < size; )
+        {
+            var bytesReceived = reader.Read(result, read, size - read);
+            if (bytesReceived == 0) throw new IOException("unexpected end of file");
+            read += bytesReceived;
+        }
+
+        return result;
+    }
+
+
+    public void PurgeAll()
+    {
+        var dir = new DirectoryInfo(this.checkpointNamingScheme.BaseName());
+        if (dir.Exists) dir.Delete(true);
+    }
+
+    /// 
+    public void Purge(Guid token)
+    {
+        DeleteIfExists(checkpointNamingScheme.LogCheckpointBase(token));
+        DeleteIfExists(checkpointNamingScheme.IndexCheckpointBase(token));
+    }
+
+    #region ILogCommitManager
+
+    /// 
+    public unsafe void Commit(long beginAddress, long untilAddress, byte[] commitMetadata, long commitNum,
+        bool forceWriteMetadata)
+    {
+        if (!forceWriteMetadata && fastCommitThrottleFreq > 0 && (commitCount++ % fastCommitThrottleFreq != 0)) return;
+
+        using var fileStream = 
GetFile(checkpointNamingScheme.FasterLogCommitMetadata(commitNum)); + + // Two phase to ensure we write metadata in single Write operation + using var writer = new BinaryWriter(fileStream); + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + writer.Flush(); + + if (removeOutdated) + { + var prior = flogCommitHistory[flogCommitHistoryOffset]; + flogCommitHistory[flogCommitHistoryOffset] = commitNum; + flogCommitHistoryOffset = (byte)((flogCommitHistoryOffset + 1) % flogCommitCount); + if (prior != default) + { + // System.Threading.Tasks.Task.Run(() => deviceFactory.Delete(checkpointNamingScheme.FasterLogCommitMetadata(prior))); + deleteQueue.EnqueueAndTryWork(prior, true); + } + } + } + + /// + public void Dispose() + { + GC.SuppressFinalize(this); + } + + private IEnumerable ListContents(string path) + { + var pathInfo = new DirectoryInfo(Path.Combine(checkpointNamingScheme.BaseName(), path)); + if (!pathInfo.Exists) yield break; + + foreach (var folder in pathInfo.GetDirectories().OrderByDescending(f => f.LastWriteTime)) + { + yield return new FileDescriptor(folder.Name, ""); + } + + foreach (var file in pathInfo.GetFiles().OrderByDescending(f => f.LastWriteTime)) + { + yield return new FileDescriptor("", file.Name); + } + } + /// + public IEnumerable ListCommits() + { + return ListContents(checkpointNamingScheme.FasterLogCommitBasePath()) + .Select(e => checkpointNamingScheme.CommitNumber(e)).OrderByDescending(e => e); + } + + /// + public void RemoveCommit(long commitNum) + { + DeleteIfExists(checkpointNamingScheme.FasterLogCommitMetadata(commitNum)); + } + + /// + public void RemoveAllCommits() + { + foreach (var commitNum in ListCommits()) + RemoveCommit(commitNum); + } + + + + /// + public byte[] GetCommitMetadata(long commitNum) + { + var fd = checkpointNamingScheme.FasterLogCommitMetadata(commitNum); + return ReadFileBodyFully(Path.Combine(checkpointNamingScheme.BaseName(), fd.directoryName, fd.fileName)); + } + + #endregion + + 
#region ICheckpointManager + + /// + public unsafe void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata) + { + using var file = GetFile(checkpointNamingScheme.IndexCheckpointMetadata(indexToken)); + // Two phase to ensure we write metadata in single Write operation + using var writer = new BinaryWriter(file); + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + writer.Flush(); + + if (removeOutdated) + { + var prior = indexTokenHistory[indexTokenHistoryOffset]; + indexTokenHistory[indexTokenHistoryOffset] = indexToken; + indexTokenHistoryOffset = (byte)((indexTokenHistoryOffset + 1) % indexTokenCount); + if (prior != default) + DeleteIfExists(checkpointNamingScheme.IndexCheckpointBase(prior)); + } + } + + /// + public IEnumerable GetIndexCheckpointTokens() + { + return ListContents(checkpointNamingScheme.IndexCheckpointBasePath()) + .Select(e => checkpointNamingScheme.Token(e)); + } + + /// + public byte[] GetIndexCheckpointMetadata(Guid indexToken) + { + var fd = checkpointNamingScheme.IndexCheckpointMetadata(indexToken); + return ReadFileBodyFully(Path.Combine(checkpointNamingScheme.BaseName(), fd.directoryName, fd.fileName)); + } + + /// + public virtual unsafe void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata) + { + + using var file = GetFile(checkpointNamingScheme.LogCheckpointMetadata(logToken)); + // Two phase to ensure we write metadata in single Write operation + using var writer = new BinaryWriter(file); + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + writer.Flush(); + + if (removeOutdated) + { + var prior = logTokenHistory[logTokenHistoryOffset]; + logTokenHistory[logTokenHistoryOffset] = logToken; + logTokenHistoryOffset = (byte)((logTokenHistoryOffset + 1) % logTokenCount); + if (prior != default) + DeleteIfExists(checkpointNamingScheme.LogCheckpointBase(prior)); + } + } + + /// + public virtual unsafe void CommitLogIncrementalCheckpoint(Guid logToken, long version, byte[] 
commitMetadata, + DeltaLog deltaLog) + { + deltaLog.Allocate(out int length, out long physicalAddress); + if (length < commitMetadata.Length) + { + deltaLog.Seal(0, DeltaLogEntryType.CHECKPOINT_METADATA); + deltaLog.Allocate(out length, out physicalAddress); + if (length < commitMetadata.Length) + { + deltaLog.Seal(0); + throw new Exception( + $"Metadata of size {commitMetadata.Length} does not fit in delta log space of size {length}"); + } + } + + fixed (byte* ptr = commitMetadata) + { + Buffer.MemoryCopy(ptr, (void*)physicalAddress, commitMetadata.Length, commitMetadata.Length); + } + + deltaLog.Seal(commitMetadata.Length, DeltaLogEntryType.CHECKPOINT_METADATA); + deltaLog.FlushAsync().Wait(); + } + + /// + public IEnumerable GetLogCheckpointTokens() + { + return ListContents(checkpointNamingScheme.LogCheckpointBasePath()) + .Select(e => checkpointNamingScheme.Token(e)); + } + + /// + public virtual byte[] GetLogCheckpointMetadata(Guid logToken, DeltaLog deltaLog, bool scanDelta, long recoverTo) + { + byte[] metadata = null; + if (deltaLog != null && scanDelta) + { + // Try to get latest valid metadata from delta-log + deltaLog.Reset(); + while (deltaLog.GetNext(out long physicalAddress, out int entryLength, out var type)) + { + switch (type) + { + case DeltaLogEntryType.DELTA: + // consider only metadata records + continue; + case DeltaLogEntryType.CHECKPOINT_METADATA: + metadata = new byte[entryLength]; + unsafe + { + fixed (byte* m = metadata) + Buffer.MemoryCopy((void*)physicalAddress, m, entryLength, entryLength); + } + + HybridLogRecoveryInfo recoveryInfo = new(); + using (StreamReader s = new(new MemoryStream(metadata))) + { + recoveryInfo.Initialize(s); + // Finish recovery if only specific versions are requested + if (recoveryInfo.version == recoverTo || recoveryInfo.version < recoverTo && + recoveryInfo.nextVersion > recoverTo) goto LoopEnd; + } + + continue; + default: + throw new FasterException("Unexpected entry type"); + } + + LoopEnd: + break; + } 
+ + if (metadata != null) return metadata; + } + + var fd = checkpointNamingScheme.LogCheckpointMetadata(logToken); + return ReadFileBodyFully(Path.Combine(checkpointNamingScheme.BaseName(), fd.directoryName, fd.fileName)); + } + + /// + public IDevice GetIndexDevice(Guid indexToken) + { + return deviceFactory.Get(checkpointNamingScheme.HashTable(indexToken)); + } + + /// + public IDevice GetSnapshotLogDevice(Guid token) + { + return deviceFactory.Get(checkpointNamingScheme.LogSnapshot(token)); + } + + /// + public IDevice GetSnapshotObjectLogDevice(Guid token) + { + return deviceFactory.Get(checkpointNamingScheme.ObjectLogSnapshot(token)); + } + + /// + public IDevice GetDeltaLogDevice(Guid token) + { + return deviceFactory.Get(checkpointNamingScheme.DeltaLog(token)); + } + + /// + public void InitializeIndexCheckpoint(Guid indexToken) + { + } + + /// + public void InitializeLogCheckpoint(Guid logToken) + { + } + + /// + public void OnRecovery(Guid indexToken, Guid logToken) + { + if (!removeOutdated) return; + + // Add recovered tokens to history, for eventual purging + if (indexToken != default) + { + indexTokenHistory[indexTokenHistoryOffset] = indexToken; + indexTokenHistoryOffset = (byte)((indexTokenHistoryOffset + 1) % indexTokenCount); + } + + if (logToken != default) + { + logTokenHistory[logTokenHistoryOffset] = logToken; + logTokenHistoryOffset = (byte)((logTokenHistoryOffset + 1) % logTokenCount); + } + + // Purge all log checkpoints that were not used for recovery + foreach (var recoveredLogToken in GetLogCheckpointTokens()) + { + if (recoveredLogToken != logToken) + DeleteIfExists(checkpointNamingScheme.LogCheckpointBase(recoveredLogToken)); + } + + // Purge all index checkpoints that were not used for recovery + foreach (var recoveredIndexToken in GetIndexCheckpointTokens()) + { + if (recoveredIndexToken != indexToken) + DeleteIfExists(checkpointNamingScheme.IndexCheckpointBase(recoveredIndexToken)); + } + } + + /// + public void OnRecovery(long 
commitNum) + { + if (!removeOutdated) return; + + foreach (var recoveredCommitNum in ListCommits()) + if (recoveredCommitNum != commitNum) + RemoveCommit(recoveredCommitNum); + + // Add recovered tokens to history, for eventual purging + if (commitNum != default) + { + flogCommitHistory[flogCommitHistoryOffset] = commitNum; + flogCommitHistoryOffset = (byte)((flogCommitHistoryOffset + 1) % flogCommitCount); + } + } + #endregion + + + /// + public virtual void CheckpointVersionShift(long oldVersion, long newVersion) + { + } +} \ No newline at end of file diff --git a/cs/src/core/Index/FASTER/FASTER.cs b/cs/src/core/Index/FASTER/FASTER.cs index 849ed36dc..d837df1f2 100644 --- a/cs/src/core/Index/FASTER/FASTER.cs +++ b/cs/src/core/Index/FASTER/FASTER.cs @@ -387,6 +387,39 @@ public bool TryInitiateHybridLogCheckpoint(out Guid token, CheckpointType checkp return result; } + + public bool TryTakeDprStyleCheckpoint(long version, ReadOnlySpan metadata, Action onPersist, out Guid token) + { + CommitCookie = metadata.ToArray(); + var backend = new FoldOverCheckpointTask(onPersist); + var stateMachine = new HybridLogCheckpointStateMachine(backend, version); + var success = StartStateMachine(stateMachine); + token = _hybridLogCheckpointToken; + if (!success) return false; + Debug.Assert(!epoch.ThisInstanceProtected()); + while (true) + { + try + { + epoch.Resume(); + var systemState = this.systemState; + if (systemState.Version == stateMachine.ToVersion()) + return true; + ThreadStateMachineStep(null, NullFasterSession.Instance, null); + } + catch (Exception) + { + this._indexCheckpoint.Reset(); + this._hybridLogCheckpoint.Dispose(); + throw; + } + finally + { + epoch.Suspend(); + } + } + } + /// /// Take log-only checkpoint /// diff --git a/cs/src/core/Index/Recovery/ICheckpointManager.cs b/cs/src/core/Index/Recovery/ICheckpointManager.cs index e9b5cc9f7..113037f20 100644 --- a/cs/src/core/Index/Recovery/ICheckpointManager.cs +++ 
b/cs/src/core/Index/Recovery/ICheckpointManager.cs @@ -142,7 +142,7 @@ public interface ICheckpointManager : IDisposable public void PurgeAll(); /// - /// Initiatize manager on recovery (e.g., deleting other checkpoints) + /// Initialize manager on recovery (e.g., deleting other checkpoints) /// /// /// diff --git a/cs/src/core/Index/Synchronization/HybridLogCheckpointTask.cs b/cs/src/core/Index/Synchronization/HybridLogCheckpointTask.cs index ae7ae2792..805f699bf 100644 --- a/cs/src/core/Index/Synchronization/HybridLogCheckpointTask.cs +++ b/cs/src/core/Index/Synchronization/HybridLogCheckpointTask.cs @@ -16,6 +16,8 @@ namespace FASTER.core internal abstract class HybridLogCheckpointOrchestrationTask : ISynchronizationTask { private long lastVersion; + protected Action onPersist; + /// public virtual void GlobalBeforeEnteringState(SystemState next, FasterKV faster) @@ -102,6 +104,12 @@ protected static void CollectMetadata(SystemState next, FasterKV(SystemState next, FasterKV faster) { + switch (next.Phase) + { + case Phase.REST: + onPersist?.Invoke(); + break; + } } /// @@ -138,6 +146,11 @@ public virtual void OnThreadState internal sealed class FoldOverCheckpointTask : HybridLogCheckpointOrchestrationTask { + internal FoldOverCheckpointTask(Action onPersist = null) + { + this.onPersist = onPersist; + } + /// public override void GlobalBeforeEnteringState(SystemState next, FasterKV faster) diff --git a/cs/src/core/Utilities/Utility.cs b/cs/src/core/Utilities/Utility.cs index ab510a1de..6a4cdc551 100644 --- a/cs/src/core/Utilities/Utility.cs +++ b/cs/src/core/Utilities/Utility.cs @@ -112,7 +112,7 @@ internal static long PreviousPowerOf2(long v) /// /// /// - internal static string PrettySize(long value) + public static string PrettySize(long value) { char[] suffix = new char[] { 'K', 'M', 'G', 'T', 'P' }; double v = value; diff --git a/cs/src/devices/AzureStorageDevice/AzureStorageDevice.cs b/cs/src/devices/AzureStorageDevice/AzureStorageDevice.cs index 
13b99025f..b6ea80f34 100644 --- a/cs/src/devices/AzureStorageDevice/AzureStorageDevice.cs +++ b/cs/src/devices/AzureStorageDevice/AzureStorageDevice.cs @@ -328,7 +328,7 @@ public void PurgeAll() { foreach (var entry in blobs) { - entry.Value.PageBlob.Default?.Delete(); + entry.Value.PageBlob.Default?.DeleteIfExists(); } }