Base62编码的C#实现

在缩短URL等应用中经常要使用到base62编码,与base64编码不同的是去除了两个符号,使得码空间由数字和英文字母组成。网络上看到的几种编码和解码的算法。它们都使用了modulus,时间复杂度大约O(n^2)。另外,它们只能应用于长整形或其他整形的编码。而我需要512位编码,所以可以决定编写一个可以编码任意长度的字节数组的工具。GitHub: https://github.com/renmengye/base62-csharp

具体的算法我参照了IEEE上面的一篇论文[1],是将字节数组分为六位一组,由于62的码空间不足编六位(64),那么对于11111和11110开始的字位组,将第六位延后到下一组。这个算法时间复杂度可以达到O(n)。

首先,编写一个BitStream类方便我对字节数组每一位的读写。这也算是高级语言的黑色幽默吧。

    /// 
    /// Utility that read and write bits in byte array
    /// 
    public class BitStream : Stream
    {
        private byte[] Source { get; set; }

        /// 
        /// Initialize the stream with capacity
        /// 
        /// Capacity of the stream
        public BitStream(int capacity)
        {
            this.Source = new byte[capacity];
        }

        /// 
        /// Initialize the stream with a source byte array
        /// 
        /// 
        public BitStream(byte[] source)
        {
            this.Source = source;
        }

        public override bool CanRead
        {
            get { return true; }
        }

        public override bool CanSeek
        {
            get { return true; }
        }

        public override bool CanWrite
        {
            get { return true; }
        }

        public override void Flush()
        {
            throw new NotImplementedException();
        }

        /// 
        /// Bit length of the stream
        /// 
        public override long Length
        {
            get { return Source.Length * 8; }
        }

        /// 
        /// Bit position of the stream
        /// 
        public override long Position { get; set; }

        /// 
        /// Read the stream to the buffer
        /// 
        /// Buffer
        /// Offset bit start position of the stream
        /// Number of bits to read
        /// Number of bits read
        public override int Read(byte[] buffer, int offset, int count)
        {
            // Temporary position cursor
            long tempPos = this.Position;
            tempPos += offset;

            // Buffer byte position and in-byte position
            int readPosCount = 0, readPosMod = 0;

            // Stream byte position and in-byte position
            long posCount = tempPos >> 3;
            int posMod = (int)(tempPos - ((tempPos >> 3) << 3));

            while (tempPos < this.Position + offset + count && tempPos < this.Length)
            {
                // Copy the bit from the stream to buffer
                if ((((int)this.Source[posCount]) & (0x1 << (7 - posMod))) != 0)
                {
                    buffer[readPosCount] = (byte)((int)(buffer[readPosCount]) | (0x1 << (7 - readPosMod)));
                }
                else
                {
                    buffer[readPosCount] = (byte)((int)(buffer[readPosCount]) & (0xffffffff - (0x1 << (7 - readPosMod))));
                }

                // Increment position cursors
                tempPos++;
                if (posMod == 7)
                {
                    posMod = 0;
                    posCount++;
                }
                else
                {
                    posMod++;
                }
                if (readPosMod == 7)
                {
                    readPosMod = 0;
                    readPosCount++;
                }
                else
                {
                    readPosMod++;
                }
            }
            int bits = (int)(tempPos - this.Position - offset);
            this.Position = tempPos;
            return bits;
        }

        /// 
        /// Set up the stream position
        /// 
        /// Position
        /// Position origin
        /// Position after setup
        public override long Seek(long offset, SeekOrigin origin)
        {
            switch (origin)
            {
                case (SeekOrigin.Begin):
                    {
                        this.Position = offset;
                        break;
                    }
                case (SeekOrigin.Current):
                    {
                        this.Position += offset;
                        break;
                    }
                case (SeekOrigin.End):
                    {
                        this.Position = this.Length + offset;
                        break;
                    }
            }
            return this.Position;
        }

        public override void SetLength(long value)
        {
            throw new NotImplementedException();
        }

        /// 
        /// Write from buffer to the stream
        /// 
        /// 
        /// Offset start bit position of buffer
        /// Number of bits
        public override void Write(byte[] buffer, int offset, int count)
        {
            // Temporary position cursor
            long tempPos = this.Position;

            // Buffer byte position and in-byte position
            int readPosCount = offset >> 3, readPosMod = offset - ((offset >> 3) << 3);

            // Stream byte position and in-byte position
            long posCount = tempPos >> 3;
            int posMod = (int)(tempPos - ((tempPos >> 3) << 3));

            while (tempPos < this.Position + count && tempPos < this.Length)
            {
                // Copy the bit from buffer to the stream
                if ((((int)buffer[readPosCount]) & (0x1 << (7 - readPosMod))) != 0)
                {
                    this.Source[posCount] = (byte)((int)(this.Source[posCount]) | (0x1 << (7 - posMod)));
                }
                else
                {
                    this.Source[posCount] = (byte)((int)(this.Source[posCount]) & (0xffffffff - (0x1 << (7 - posMod))));
                }

                // Increment position cursors
                tempPos++;
                if (posMod == 7)
                {
                    posMod = 0;
                    posCount++;
                }
                else
                {
                    posMod++;
                }
                if (readPosMod == 7)
                {
                    readPosMod = 0;
                    readPosCount++;
                }
                else
                {
                    readPosMod++;
                }
            }
            this.Position = tempPos;
        }
    }

接下来可以着手编写两个扩展方法: ToBase62和FromBase62。

    /// 
    /// Extension methods for base62 encoding
    /// 
    public static class EncodingExtensions
    {
        private static string Base62CodingSpace = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

        /// 
        /// Convert a byte array
        /// 
        /// Byte array
        /// Base62 string
        public static string ToBase62(this byte[] original)
        {
            StringBuilder sb = new StringBuilder();
            BitStream stream = new BitStream(original);         // Set up the BitStream
            byte[] read = new byte[1];                          // Only read 6-bit at a time
            while (true)
            {
                read[0] = 0;
                int length = stream.Read(read, 0, 6);           // Try to read 6 bits
                if (length == 6)                                // Not reaching the end
                {
                    if ((int)(read[0] >> 3) == 0x1f)            // First 5-bit is 11111
                    {
                        sb.Append(Base62CodingSpace[61]);
                        stream.Seek(-1, SeekOrigin.Current);    // Leave the 6th bit to next group
                    }
                    else if ((int)(read[0] >> 3) == 0x1e)       // First 5-bit is 11110
                    {
                        sb.Append(Base62CodingSpace[60]);
                        stream.Seek(-1, SeekOrigin.Current);
                    }
                    else                                        // Encode 6-bit
                    {
                        sb.Append(Base62CodingSpace[(int)(read[0] >> 2)]);
                    }
                }
                else
                {
                    // Padding 0s to make the last bits to 6 bit
                    sb.Append(Base62CodingSpace[(int)(read[0] >> (int)(8 - length))]);
                    break;
                }
            }
            return sb.ToString();
        }

        /// 
        /// Convert a Base62 string to byte array
        /// 
        /// Base62 string
        /// Byte array
        public static byte[] FromBase62(this string base62)
        {
            // Character count
            int count = 0;

            // Set up the BitStream
            BitStream stream = new BitStream(base62.Length * 6 / 8);

            foreach (char c in base62)
            {
                // Look up coding table
                int index = Base62CodingSpace.IndexOf(c);

                // If end is reached
                if (count == base62.Length - 1)
                {
                    // Check if the ending is good
                    int mod = (int)(stream.Position % 8);
                    stream.Write(new byte[] { (byte)(index << (mod)) }, 0, 8 - mod);
                }
                else
                {
                    // If 60 or 61 then only write 5 bits to the stream, otherwise 6 bits.
                    if (index == 60)
                    {
                        stream.Write(new byte[] { 0xf0 }, 0, 5);
                    }
                    else if (index == 61)
                    {
                        stream.Write(new byte[] { 0xf8 }, 0, 5);
                    }
                    else
                    {
                        stream.Write(new byte[] { (byte)index }, 2, 6);
                    }
                }
                count++;
            }

            // Dump out the bytes
            byte[] result = new byte[stream.Position / 8];
            stream.Seek(0, SeekOrigin.Begin);
            stream.Read(result, 0, result.Length * 8);
            return result;
        }
    }

下面的代码是使用base62编码的实例:

        [TestMethod]
        public void EncodeBasics()
        {
            string s = (new byte[] { 116, 32, 8, 99, 100, 232, 4, 7 }).ToBase62();
            Assert.AreEqual("T208OsJe107", s);
        }
        [TestMethod]
        public void DecodeBasics()
        {
            byte[] result = "T208OsJe107".FromBase62();
            Assert.IsTrue(result.SequenceEqual(new byte[] { 116, 32, 8, 99, 100, 232, 4, 7 }));
        }

参考文献
[1] Kejing He, Xiancheng Xu and Qiang Yue, A Secure, Lossless, and Compressed Base62 Encoding, ICCS 2008.

“Base62编码的C#实现”上的一条回复

评论已关闭。