// Untitled
//
// Never
// C#
//
using ILGPU;
using ILGPU.Runtime;
using ILGPU.Runtime.CPU;
using ILGPU.Runtime.Cuda;
using System;
using System.Linq;
using System.Diagnostics;

namespace VVatashi.Cryptography
{
  public static class Program
  {
    // Holds the sixteen 64-bit round keys as individual fields, Key0 through Key15,
    // declared in round order.
    private struct RoundKeys
    {
      public ulong Key0, Key1, Key2, Key3;
      public ulong Key4, Key5, Key6, Key7;
      public ulong Key8, Key9, Key10, Key11;
      public ulong Key12, Key13, Key14, Key15;
    }

    // Lookup tables. Every permutation table stores 0-based source bit positions
    // counted from the most significant bit of a 64-bit word: output bit i is taken
    // from input bit table[i]. The fields are readonly because they are only ever
    // read (copied to the accelerator) and never reassigned.

    // Key permutation applied to the raw key: 56 entries.
    private static readonly byte[] PC1 = new byte[] {
      56, 48, 40, 32, 24, 16,  8,  0,
      57, 49, 41, 33, 25, 17,  9,  1,
      58, 50, 42, 34, 26, 18, 10,  2,
      59, 51, 43, 35, 62, 54, 46, 38,
      30, 22, 14,  6, 61, 53, 45, 37,
      29, 21, 13,  5, 60, 52, 44, 36,
      28, 20, 12,  4, 27, 19, 11,  3,
    };

    // Key permutation applied to the rotated key halves to form a round key: 48 entries.
    private static readonly byte[] PC2 = new byte[] {
      13, 16, 10, 23,  0,  4,  2, 27,
      14,  5, 20,  9, 22, 18, 11,  3,
      25,  7, 15,  6, 26, 19, 12,  1,
      40, 51, 30, 36, 46, 54, 29, 39,
      50, 44, 32, 47, 43, 48, 38, 55,
      33, 52, 45, 41, 49, 35, 28, 31,
    };

    // Initial permutation of the 64-bit input block.
    private static readonly byte[] IP = new byte[] {
      57, 49, 41, 33, 25, 17,  9, 1,
      59, 51, 43, 35, 27, 19, 11, 3,
      61, 53, 45, 37, 29, 21, 13, 5,
      63, 55, 47, 39, 31, 23, 15, 7,
      56, 48, 40, 32, 24, 16,  8, 0,
      58, 50, 42, 34, 26, 18, 10, 2,
      60, 52, 44, 36, 28, 20, 12, 4,
      62, 54, 46, 38, 30, 22, 14, 6,
    };

    // Final permutation of the 64-bit block; the inverse of IP (IP[IIP[i]] == i).
    private static readonly byte[] IIP = new byte[] {
      39, 7, 47, 15, 55, 23, 63, 31,
      38, 6, 46, 14, 54, 22, 62, 30,
      37, 5, 45, 13, 53, 21, 61, 29,
      36, 4, 44, 12, 52, 20, 60, 28,
      35, 3, 43, 11, 51, 19, 59, 27,
      34, 2, 42, 10, 50, 18, 58, 26,
      33, 1, 41,  9, 49, 17, 57, 25,
      32, 0, 40,  8, 48, 16, 56, 24,
    };

    // Expansion of a 32-bit half block to 48 bits (some source bits appear twice).
    private static readonly byte[] E = new byte[] {
      31,  0,  1,  2,  3,  4,
       3,  4,  5,  6,  7,  8,
       7,  8,  9, 10, 11, 12,
      11, 12, 13, 14, 15, 16,
      15, 16, 17, 18, 19, 20,
      19, 20, 21, 22, 23, 24,
      23, 24, 25, 26, 27, 28,
      27, 28, 29, 30, 31,  0,
    };

    // Permutation applied to the 32-bit output of the S-box stage.
    private static readonly byte[] P = new byte[] {
      15,  6, 19, 20, 28, 11, 27, 16,
       0, 14, 22, 25,  4, 17, 30,  9,
       1,  7, 23, 13, 31, 26,  2,  8,
      18, 12, 29,  5, 21, 10,  3, 24,
    };

    // Eight substitution tables of 64 entries each, stored back to back.
    // Table n starts at offset 64 * n and is indexed directly by the raw 6-bit
    // input group, so no separate row/column split is needed at lookup time.
    private static readonly byte[] S = new byte[] {
      // S0
      14,  0,  4, 15, 13,  7,  1,  4,  2, 14, 15, 2, 11, 13,  8,  1,
       3, 10, 10,  6,  6, 12, 12, 11,  5,  9,  9, 5,  0,  3,  7,  8,
       4, 15,  1, 12, 14,  8,  8,  2, 13,  4,  6, 9,  2,  1, 11,  7,
      15,  5, 12, 11,  9,  3,  7, 14,  3, 10, 10, 0,  5,  6,  0, 13,
      // S1
      15,  3,  1, 13,  8,  4, 14,  7,  6, 15, 11,  2,  3,  8,  4, 14,
       9, 12,  7,  0,  2,  1, 13, 10, 12,  6,  0,  9,  5, 11, 10,  5,
       0, 13, 14,  8,  7, 10, 11,  1, 10,  3,  4, 15, 13,  4,  1,  2,
       5, 11,  8,  6, 12,  7,  6, 12,  9,  0,  3,  5,  2, 14, 15,  9,
      // S2
      10, 13,  0,  7,  9,  0, 14,  9,  6,  3,  3,  4, 15,  6, 5, 10,
       1,  2, 13,  8, 12,  5,  7, 14, 11, 12,  4, 11,  2, 15, 8,  1,
      13,  1,  6, 10,  4, 13,  9,  0,  8,  6, 15,  9,  3,  8, 0,  7,
      11,  4,  1, 15,  2, 14, 12,  3,  5, 11, 10,  5, 14,  2, 7, 12,
      // S3
       7, 13, 13,  8, 14, 11,  3,  5,  0,  6,  6, 15, 9,  0, 10,  3,
       1,  4,  2,  7,  8,  2,  5, 12, 11,  1, 12, 10, 4, 14, 15,  9,
      10,  3,  6, 15,  9,  0,  0,  6, 12, 10, 11,  1, 7, 13, 13,  8,
      15,  9,  1,  4,  3,  5, 14, 11,  5, 12,  2,  7, 8,  2,  4, 14,
      // S4
       2, 14, 12, 11,  4,  2,  1, 12,  7,  4, 10,  7, 11, 13,  6,  1,
       8,  5,  5,  0,  3, 15, 15, 10, 13,  3,  0,  9, 14,  8,  9,  6,
       4, 11,  2,  8,  1, 12, 11,  7, 10,  1, 13, 14,  7,  2,  8, 13,
      15,  6,  9, 15, 12,  0,  5,  9,  6, 10,  3,  4,  0,  5, 14,  3,
      // S5
      12, 10,  1, 15, 10,  4, 15,  2,  9, 7,  2, 12,  6,  9,  8,  5,
       0,  6, 13,  1,  3, 13,  4, 14, 14, 0,  7, 11,  5,  3, 11,  8,
       9,  4, 14,  3, 15,  2,  5, 12,  2, 9,  8,  5, 12, 15,  3, 10,
       7, 11,  0, 14,  4,  1, 10,  7,  1, 6, 13,  0, 11,  8,  6, 13,
      // S6
       4, 13, 11,  0,  2, 11, 14,  7, 15,  4,  0,  9, 8,  1, 13, 10,
       3, 14, 12,  3,  9,  5,  7, 12,  5,  2, 10, 15, 6,  8,  1,  6,
       1,  6,  4, 11, 11, 13, 13,  8, 12,  1,  3,  4, 7, 10, 14,  7,
      10,  9, 15,  5,  6,  0,  8, 15,  0, 14,  5,  2, 9,  3,  2, 12,
      // S7
      13,  1,  2, 15,  8, 13,  4,  8,  6, 10, 15,  3, 11, 7, 1,  4,
      10, 12,  9,  5,  3,  6, 14, 11,  5,  0,  0, 14, 12, 9, 7,  2,
       7,  2, 11,  1,  4, 14,  1,  7,  9,  4, 12, 10, 14, 8, 2, 13,
       0, 15,  6, 12, 10,  9, 13,  0, 15,  3,  3,  5,  5, 6, 8, 11,
    };

    // Builds a new word by picking bits out of `value` according to `table`.
    // Bit positions are 0-based and counted from the most significant bit:
    // output bit i is input bit table[i]. The result is left-aligned, so a table
    // with fewer than 64 entries leaves the low-order bits of the result zero.
    private static ulong Permutate(ulong value, ArrayView<byte> table)
    {
      const ulong TopBit = 0x8000000000000000UL;

      ulong packed = 0;
      int remaining = table.Length;
      // Walk the table back to front: each step pushes earlier picks one place
      // to the right, so the entry at index 0 ends up in the top bit.
      while (remaining > 0) {
        --remaining;
        ulong probe = TopBit >> table[remaining];
        packed >>= 1;
        if ((value & probe) != 0) {
          packed |= TopBit;
        }
      }

      return packed;
    }

    // Rotates a 28-bit half key left by `count` bits. The half key occupies the
    // top 28 bits of `key`; the result is returned in the same position with the
    // remaining 36 low bits cleared.
    private static ulong ShiftHalfKey(ulong key, byte count)
    {
      const ulong HalfKeyMask = 0xFFFFFFF000000000UL;

      // Bits that stay inside the 28-bit window after moving left.
      ulong shiftedUp = key << count;
      // Bits that fell off the top, re-entering at the bottom of the window.
      ulong wrappedAround = key >> (28 - count);

      return (shiftedUp | wrappedAround) & HalfKeyMask;
    }

    // Derives all sixteen round keys from a 64-bit key.
    //   key - raw key; it is first permuted with `pc1`.
    //   pc1 - permutation applied to the raw key.
    //   pc2 - permutation applied to each rotated pair of halves.
    // The permuted key is split into two 28-bit halves (each held in the top 28
    // bits of a ulong). Every round key is pc2 applied to both halves rotated
    // left by that round's cumulative amount: 1, 2, 4, ..., 27 and finally 28,
    // which is a full turn and therefore uses the halves unrotated.
    private static RoundKeys CreateRoundKeys(ulong key, ArrayView<byte> pc1, ArrayView<byte> pc2)
    {
      const ulong HalfKeyMask = 0xFFFFFFF000000000UL;

      ulong permutedKey = Permutate(key, pc1);
      ulong upperHalf = permutedKey & HalfKeyMask;
      ulong lowerHalf = (permutedKey << 28) & HalfKeyMask;

      RoundKeys schedule = new RoundKeys();
      schedule.Key0 = Permutate(ShiftHalfKey(upperHalf, 1) | (ShiftHalfKey(lowerHalf, 1) >> 28), pc2);
      schedule.Key1 = Permutate(ShiftHalfKey(upperHalf, 2) | (ShiftHalfKey(lowerHalf, 2) >> 28), pc2);
      schedule.Key2 = Permutate(ShiftHalfKey(upperHalf, 4) | (ShiftHalfKey(lowerHalf, 4) >> 28), pc2);
      schedule.Key3 = Permutate(ShiftHalfKey(upperHalf, 6) | (ShiftHalfKey(lowerHalf, 6) >> 28), pc2);
      schedule.Key4 = Permutate(ShiftHalfKey(upperHalf, 8) | (ShiftHalfKey(lowerHalf, 8) >> 28), pc2);
      schedule.Key5 = Permutate(ShiftHalfKey(upperHalf, 10) | (ShiftHalfKey(lowerHalf, 10) >> 28), pc2);
      schedule.Key6 = Permutate(ShiftHalfKey(upperHalf, 12) | (ShiftHalfKey(lowerHalf, 12) >> 28), pc2);
      schedule.Key7 = Permutate(ShiftHalfKey(upperHalf, 14) | (ShiftHalfKey(lowerHalf, 14) >> 28), pc2);
      schedule.Key8 = Permutate(ShiftHalfKey(upperHalf, 15) | (ShiftHalfKey(lowerHalf, 15) >> 28), pc2);
      schedule.Key9 = Permutate(ShiftHalfKey(upperHalf, 17) | (ShiftHalfKey(lowerHalf, 17) >> 28), pc2);
      schedule.Key10 = Permutate(ShiftHalfKey(upperHalf, 19) | (ShiftHalfKey(lowerHalf, 19) >> 28), pc2);
      schedule.Key11 = Permutate(ShiftHalfKey(upperHalf, 21) | (ShiftHalfKey(lowerHalf, 21) >> 28), pc2);
      schedule.Key12 = Permutate(ShiftHalfKey(upperHalf, 23) | (ShiftHalfKey(lowerHalf, 23) >> 28), pc2);
      schedule.Key13 = Permutate(ShiftHalfKey(upperHalf, 25) | (ShiftHalfKey(lowerHalf, 25) >> 28), pc2);
      schedule.Key14 = Permutate(ShiftHalfKey(upperHalf, 27) | (ShiftHalfKey(lowerHalf, 27) >> 28), pc2);
      // Cumulative rotation of 28 is the identity on a 28-bit half.
      schedule.Key15 = Permutate(upperHalf | (lowerHalf >> 28), pc2);
      return schedule;
    }

    // Round function applied to one half block.
    //   key   - round key, 48 significant bits held in the top of the word.
    //   value - half block, 32 significant bits held in the top of the word.
    //   e     - expansion table (32 -> 48 bits).
    //   s     - eight 64-entry substitution tables stored back to back.
    //   p     - permutation applied to the substituted 32 bits.
    // Returns the 32-bit result left-aligned in a ulong.
    private static ulong Encrypt(
      ulong key,
      ulong value,
      ArrayView<byte> e,
      ArrayView<byte> s,
      ArrayView<byte> p
    ) {
      ulong mixed = Permutate(value, e) ^ key;

      // Fold the eight substitutions from the last table to the first. Each step
      // places the new 4-bit output in the top nibble and pushes the earlier ones
      // down, so table 0 ends up in bits 63..60 and table 7 in bits 35..32.
      ulong substituted = 0;
      for (int box = 7; box >= 0; --box) {
        // Six-bit input group for this table; group 0 sits in bits 63..58.
        int group = (int)((mixed >> (58 - 6 * box)) & 0x3FUL);
        ulong nibble = s[group + 64 * box];
        substituted = (nibble << 60) | (substituted >> 4);
      }

      return Permutate(substituted, p);
    }

    // Per-element kernel: encrypts blocks[index] under keys[index] and stores the
    // result in output[index].
    //   pc1, pc2 - key schedule permutations.
    //   ip, iip  - permutations applied to the block before and after the rounds.
    //   e, s, p  - tables used by the round function.
    // The block is split into two 32-bit halves, each kept in the top 32 bits of
    // a ulong, and run through sixteen Feistel rounds.
    private static void Kernel(
      Index index,
      ArrayView<byte> pc1,
      ArrayView<byte> pc2,
      ArrayView<byte> ip,
      ArrayView<byte> iip,
      ArrayView<byte> e,
      ArrayView<byte> s,
      ArrayView<byte> p,
      ArrayView<ulong> keys,
      ArrayView<ulong> blocks,
      ArrayView<ulong> output
    ) {
      const ulong HighWord = 0xFFFFFFFF00000000UL;

      RoundKeys schedule = CreateRoundKeys(keys[index], pc1, pc2);

      ulong permuted = Permutate(blocks[index], ip);
      ulong left = permuted & HighWord;
      ulong right = (permuted << 32) & HighWord;

      // Instead of swapping halves after every round, the two variables take
      // turns being updated: odd rounds rewrite `left`, even rounds `right`.
      left ^= Encrypt(schedule.Key0, right, e, s, p);
      right ^= Encrypt(schedule.Key1, left, e, s, p);
      left ^= Encrypt(schedule.Key2, right, e, s, p);
      right ^= Encrypt(schedule.Key3, left, e, s, p);
      left ^= Encrypt(schedule.Key4, right, e, s, p);
      right ^= Encrypt(schedule.Key5, left, e, s, p);
      left ^= Encrypt(schedule.Key6, right, e, s, p);
      right ^= Encrypt(schedule.Key7, left, e, s, p);
      left ^= Encrypt(schedule.Key8, right, e, s, p);
      right ^= Encrypt(schedule.Key9, left, e, s, p);
      left ^= Encrypt(schedule.Key10, right, e, s, p);
      right ^= Encrypt(schedule.Key11, left, e, s, p);
      left ^= Encrypt(schedule.Key12, right, e, s, p);
      right ^= Encrypt(schedule.Key13, left, e, s, p);
      left ^= Encrypt(schedule.Key14, right, e, s, p);
      right ^= Encrypt(schedule.Key15, left, e, s, p);

      // The half produced by the last round goes in the upper 32 bits and the
      // half from the round before it in the lower 32 bits.
      output[index] = Permutate(right | (left >> 32), iip);
    }

    // Entry point. Creates a CUDA accelerator, uploads the lookup tables plus
    // `count` test keys and blocks (both simply 0, 1, 2, ...), launches the kernel
    // once, reports the elapsed time and prints key, block and result for every
    // element.
    private static void Main(string[] args)
    {
      using (var context = new Context())
      using (var accelerator = new CudaAccelerator(context))
      {
        const int count = 10;
        var kernel = accelerator.LoadAutoGroupedStreamKernel<
          Index,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<byte>,
          ArrayView<ulong>,
          ArrayView<ulong>,
          ArrayView<ulong>
        >(Kernel);
        using (var pc1Buffer = accelerator.Allocate<byte>(PC1.Length))
        using (var pc2Buffer = accelerator.Allocate<byte>(PC2.Length))
        using (var ipBuffer = accelerator.Allocate<byte>(IP.Length))
        using (var iipBuffer = accelerator.Allocate<byte>(IIP.Length))
        using (var eBuffer = accelerator.Allocate<byte>(E.Length))
        using (var sBuffer = accelerator.Allocate<byte>(S.Length))
        using (var pBuffer = accelerator.Allocate<byte>(P.Length))
        using (var keysBuffer = accelerator.Allocate<ulong>(count))
        using (var blocksBuffer = accelerator.Allocate<ulong>(count))
        using (var outputBuffer = accelerator.Allocate<ulong>(count))
        {
          // Upload the constant tables.
          pc1Buffer.CopyFrom(PC1, 0, 0, PC1.Length);
          pc2Buffer.CopyFrom(PC2, 0, 0, PC2.Length);
          ipBuffer.CopyFrom(IP, 0, 0, IP.Length);
          iipBuffer.CopyFrom(IIP, 0, 0, IIP.Length);
          eBuffer.CopyFrom(E, 0, 0, E.Length);
          sBuffer.CopyFrom(S, 0, 0, S.Length);
          pBuffer.CopyFrom(P, 0, 0, P.Length);

          // Just for testing.
          ulong[] keys = Enumerable.Range(0, count).Select(x => (ulong)x).ToArray();
          keysBuffer.CopyFrom(keys, 0, 0, keys.Length);
          ulong[] blocks = Enumerable.Range(0, count).Select(x => (ulong)x).ToArray();
          blocksBuffer.CopyFrom(blocks, 0, 0, blocks.Length);

          // The measured interval runs from the launch call until
          // accelerator.Synchronize() returns.
          var sw = Stopwatch.StartNew();
          kernel(
            count,
            pc1Buffer.View,
            pc2Buffer.View,
            ipBuffer.View,
            iipBuffer.View,
            eBuffer.View,
            sBuffer.View,
            pBuffer.View,
            keysBuffer.View,
            blocksBuffer.View,
            outputBuffer.View
          );
          accelerator.Synchronize();
          sw.Stop();

          Console.WriteLine("Kernel executed in {0} ms", sw.ElapsedMilliseconds);

          ulong[] output = outputBuffer.GetAsArray();
          // Bound the loop by the actual result length rather than a literal 10,
          // so changing `count` neither truncates the listing nor reads past the
          // end of the arrays.
          for (int i = 0; i < output.Length; ++i) {
            Console.WriteLine("{0} {1} {2}", keys[i], blocks[i], output[i]);
          }
        }
      }
    }
  }
}

// Raw Text