在缩短URL等应用中经常要使用到base62编码,与base64编码不同的是去除了两个符号,使得码空间由数字和英文字母组成。网络上看到的几种编码和解码的算法。它们都使用了modulus,时间复杂度大约O(n^2)。另外,它们只能应用于长整形或其他整形的编码。而我需要512位编码,所以可以决定编写一个可以编码任意长度的字节数组的工具。GitHub: https://github.com/renmengye/base62-csharp
具体的算法我参照了IEEE上面的一篇论文[1],是将字节数组分为六位一组,由于62的码空间不足编六位(64),那么对于11111和11110开始的字位组,将第六位延后到下一组。这个算法时间复杂度可以达到O(n)。
首先,编写一个BitStream类方便我对字节数组每一位的读写。这也算是高级语言的黑色幽默吧。
///
/// Utility that read and write bits in byte array
///
public class BitStream : Stream
{
private byte[] Source { get; set; }
///
/// Initialize the stream with capacity
///
/// Capacity of the stream
public BitStream(int capacity)
{
this.Source = new byte[capacity];
}
///
/// Initialize the stream with a source byte array
///
///
public BitStream(byte[] source)
{
this.Source = source;
}
public override bool CanRead
{
get { return true; }
}
public override bool CanSeek
{
get { return true; }
}
public override bool CanWrite
{
get { return true; }
}
public override void Flush()
{
throw new NotImplementedException();
}
///
/// Bit length of the stream
///
public override long Length
{
get { return Source.Length * 8; }
}
///
/// Bit position of the stream
///
public override long Position { get; set; }
///
/// Read the stream to the buffer
///
/// Buffer
/// Offset bit start position of the stream
/// Number of bits to read
/// Number of bits read
public override int Read(byte[] buffer, int offset, int count)
{
// Temporary position cursor
long tempPos = this.Position;
tempPos += offset;
// Buffer byte position and in-byte position
int readPosCount = 0, readPosMod = 0;
// Stream byte position and in-byte position
long posCount = tempPos >> 3;
int posMod = (int)(tempPos - ((tempPos >> 3) << 3));
while (tempPos < this.Position + offset + count && tempPos < this.Length)
{
// Copy the bit from the stream to buffer
if ((((int)this.Source[posCount]) & (0x1 << (7 - posMod))) != 0)
{
buffer[readPosCount] = (byte)((int)(buffer[readPosCount]) | (0x1 << (7 - readPosMod)));
}
else
{
buffer[readPosCount] = (byte)((int)(buffer[readPosCount]) & (0xffffffff - (0x1 << (7 - readPosMod))));
}
// Increment position cursors
tempPos++;
if (posMod == 7)
{
posMod = 0;
posCount++;
}
else
{
posMod++;
}
if (readPosMod == 7)
{
readPosMod = 0;
readPosCount++;
}
else
{
readPosMod++;
}
}
int bits = (int)(tempPos - this.Position - offset);
this.Position = tempPos;
return bits;
}
///
/// Set up the stream position
///
/// Position
/// Position origin
/// Position after setup
public override long Seek(long offset, SeekOrigin origin)
{
switch (origin)
{
case (SeekOrigin.Begin):
{
this.Position = offset;
break;
}
case (SeekOrigin.Current):
{
this.Position += offset;
break;
}
case (SeekOrigin.End):
{
this.Position = this.Length + offset;
break;
}
}
return this.Position;
}
public override void SetLength(long value)
{
throw new NotImplementedException();
}
///
/// Write from buffer to the stream
///
///
/// Offset start bit position of buffer
/// Number of bits
public override void Write(byte[] buffer, int offset, int count)
{
// Temporary position cursor
long tempPos = this.Position;
// Buffer byte position and in-byte position
int readPosCount = offset >> 3, readPosMod = offset - ((offset >> 3) << 3);
// Stream byte position and in-byte position
long posCount = tempPos >> 3;
int posMod = (int)(tempPos - ((tempPos >> 3) << 3));
while (tempPos < this.Position + count && tempPos < this.Length)
{
// Copy the bit from buffer to the stream
if ((((int)buffer[readPosCount]) & (0x1 << (7 - readPosMod))) != 0)
{
this.Source[posCount] = (byte)((int)(this.Source[posCount]) | (0x1 << (7 - posMod)));
}
else
{
this.Source[posCount] = (byte)((int)(this.Source[posCount]) & (0xffffffff - (0x1 << (7 - posMod))));
}
// Increment position cursors
tempPos++;
if (posMod == 7)
{
posMod = 0;
posCount++;
}
else
{
posMod++;
}
if (readPosMod == 7)
{
readPosMod = 0;
readPosCount++;
}
else
{
readPosMod++;
}
}
this.Position = tempPos;
}
}
接下来可以着手编写两个扩展方法: ToBase62和FromBase62。
///
/// Extension methods for base62 encoding
///
public static class EncodingExtensions
{
private static string Base62CodingSpace = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
///
/// Convert a byte array
///
/// Byte array
/// Base62 string
public static string ToBase62(this byte[] original)
{
StringBuilder sb = new StringBuilder();
BitStream stream = new BitStream(original); // Set up the BitStream
byte[] read = new byte[1]; // Only read 6-bit at a time
while (true)
{
read[0] = 0;
int length = stream.Read(read, 0, 6); // Try to read 6 bits
if (length == 6) // Not reaching the end
{
if ((int)(read[0] >> 3) == 0x1f) // First 5-bit is 11111
{
sb.Append(Base62CodingSpace[61]);
stream.Seek(-1, SeekOrigin.Current); // Leave the 6th bit to next group
}
else if ((int)(read[0] >> 3) == 0x1e) // First 5-bit is 11110
{
sb.Append(Base62CodingSpace[60]);
stream.Seek(-1, SeekOrigin.Current);
}
else // Encode 6-bit
{
sb.Append(Base62CodingSpace[(int)(read[0] >> 2)]);
}
}
else
{
// Padding 0s to make the last bits to 6 bit
sb.Append(Base62CodingSpace[(int)(read[0] >> (int)(8 - length))]);
break;
}
}
return sb.ToString();
}
///
/// Convert a Base62 string to byte array
///
/// Base62 string
/// Byte array
public static byte[] FromBase62(this string base62)
{
// Character count
int count = 0;
// Set up the BitStream
BitStream stream = new BitStream(base62.Length * 6 / 8);
foreach (char c in base62)
{
// Look up coding table
int index = Base62CodingSpace.IndexOf(c);
// If end is reached
if (count == base62.Length - 1)
{
// Check if the ending is good
int mod = (int)(stream.Position % 8);
stream.Write(new byte[] { (byte)(index << (mod)) }, 0, 8 - mod);
}
else
{
// If 60 or 61 then only write 5 bits to the stream, otherwise 6 bits.
if (index == 60)
{
stream.Write(new byte[] { 0xf0 }, 0, 5);
}
else if (index == 61)
{
stream.Write(new byte[] { 0xf8 }, 0, 5);
}
else
{
stream.Write(new byte[] { (byte)index }, 2, 6);
}
}
count++;
}
// Dump out the bytes
byte[] result = new byte[stream.Position / 8];
stream.Seek(0, SeekOrigin.Begin);
stream.Read(result, 0, result.Length * 8);
return result;
}
}
下面的代码是使用base62编码的实例:
[TestMethod]
public void EncodeBasics()
{
string s = (new byte[] { 116, 32, 8, 99, 100, 232, 4, 7 }).ToBase62();
Assert.AreEqual("T208OsJe107", s);
}
[TestMethod]
public void DecodeBasics()
{
byte[] result = "T208OsJe107".FromBase62();
Assert.IsTrue(result.SequenceEqual(new byte[] { 116, 32, 8, 99, 100, 232, 4, 7 }));
}
参考文献
[1] Kejing He, Xiancheng Xu and Qiang Yue, A Secure, Lossless, and Compressed Base62 Encoding, ICCS 2008.
Nice style, I’ll look at the algorithm a bit later, but O(n) is a lot faster. Bitstream is so cheap in C#…