一个高度压缩的bit位图字典的实现

微软实现的字典功能简单方便,出于全局性考虑,其内部实现比较复杂,在对海量数据处理时,仅仅存放一个对象地址就将占据32个bit(64位机器中占据64个bit),而且其内部通过int[]来处理hash桶,占用空间较大。

如果对于一些特定的海量数据的处理进行字典的自我定制,将极大地节省空间。

最近做的一个项目中字典中存放的数据达到几百万条,不同日期数值重复率极大,故实现了一个定制字典,实际结果和普通的Dictionay比较,压缩率达到一半以上。

该字典针对key为DateTime类型的数据进行了优化:同一个value的地址(value为引用类型)只保存一次(普通的字典中key不同value相同时,value都指向同一个引用类型的地址,即有多少个key则value的地址会保存多少份,同时还会维护一个hash桶),通过维护一个bitMap来标明该对象适用的日期,并做了一些相应的优化(如全局bitmap的实现)。同时,比较日期是否适用时全部采用位操作,因为代码在64位机器上运行,故bit操作时都以64bit作为操作对象。

这种字典数据重复率越高,压缩约明显,所以需要针对数据特征判断该实现是否适用。

理论上,只要key在一个有限可量化的集合里,都可以实现类似的字典。

key为dateTime的实现如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Data; namespace BitmapDictionary
{
class Program
{ /// <summary>
///Author: zhao-ewwj@163.com
///AuthorDate: 2015-01-22 14:12:10 +0800
///Commit: zhao-ewwj@163.com
///CommitDate: 2015-01-22 14:12:10 +0800
///根据酒店数据特征定制的字典,key为日期形式,key与value以bit位的形式对应,value以链表的形式组织
/// <typeparam name="TKey"></typeparam>
/// <typeparam name="TValue"></typeparam>
public class BitMapDictionary<TValue> : IDictionary<DateTime, TValue>
{ /// <summary>
/// 判断字典中元素相等
/// </summary>
/// <param name="t1"></param>
/// <param name="t2"></param>
/// <returns></returns>
private static bool ObjEqual(TValue t1, TValue t2)
{
return t1 == null ? t2 == null : t1.Equals(t2);
} public BitMapDictionary()
: this(DateTime.Now.AddDays(-), ObjEqual)
{
} public BitMapDictionary(int days)
: this(DateTime.Now.AddDays(days), ObjEqual)
{
} public BitMapDictionary(BitMapDictionary<TValue> dictionary)
: this(dictionary.relativeDateTime, dictionary.checkEqual)
{
this.GlobalBitmap = dictionary.GlobalBitmap;
if (dictionary.Head == null)
{
this.Head = null;
return;
}
this.Head = dictionary.Head.Clone() as Entry;
var newCurrent = this.Head;
var oldCurrent = dictionary.Head;
while (oldCurrent.next != null)
{
newCurrent.next = oldCurrent.next.Clone() as Entry;
newCurrent = newCurrent.next;
oldCurrent = oldCurrent.next;
}
} /// <summary>
/// 构造器
/// </summary>
/// <param name="relativeDateTime">相对时间</param>
/// <param name="checkEqual">比较对象相等</param>
public BitMapDictionary(DateTime dateTime, Func<TValue, TValue, bool> checkEqual)
{
this.relativeDateTime = dateTime;
this.checkEqual = checkEqual;
} /// <summary>
/// 构造器
/// </summary>
/// <param name="relativeDateTime">相对时间</param>
/// <param name="checkEqual">比较对象相等</param>
public BitMapDictionary(Dictionary<DateTime, TValue> dictionary, DateTime dateTime, Func<TValue, TValue, bool> checkEqual)
: this(dateTime,checkEqual)
{
foreach (var pair in dictionary)
{
this[pair.Key] = pair.Value;
}
} /// <summary>
/// 相对时间
/// </summary>
private DateTime relativeDateTime;
/// <summary>
/// 相等比较函数
/// </summary>
private Func<TValue, TValue, bool> checkEqual;
/// <summary>
/// 链表头:第一版本,使用线性链表,遍历查询,待续..
/// </summary>
public Entry Head;
/// <summary>
/// 链表元素
/// </summary>
public class Entry
{
public TValue value;
public BitMap bitMap;
public Entry next;
/// <summary>
/// 复制
/// </summary>
/// <returns>返回复制</returns>
public object Clone()
{
return this.MemberwiseClone();
}
} //预存256天的价格数据的bit位图
public struct BitMap
{
public ulong Days193To256;
public ulong Days129To192;
public ulong Day65To128;
public ulong Day1To64; /// <summary>
/// 将Bit置1
/// </summary>
/// <param name="days"></param>
public void SetBit(int day)
{
int index = day & 0x3F;//得到mod 64的结果
int unit = day >> ;//得到除64的整数结果
ulong subCode = ;
subCode <<= index;
switch (unit)
{
case :
this.Days193To256 |= subCode;
break;
case :
this.Days129To192 |= subCode;
break;
case :
this.Day65To128 |= subCode;
break;
case :
this.Day1To64 |= subCode;
break;
}
} /// <summary>
/// 将Bit置0
/// </summary>
/// <param name="day"></param>
public void ResetBit(int day)
{
int index = day & 0x3F;//得到mod 64的结果
int unit = day >> ;//得到除64的整数结果
ulong subCode = ;
subCode <<= index;
subCode = ulong.MaxValue ^ subCode;//异或取掩码
switch (unit)
{
case :
this.Days193To256 &= subCode;
break;
case :
this.Days129To192 &= subCode;
break;
case :
this.Day65To128 &= subCode;
break;
case :
this.Day1To64 &= subCode;
break;
}
} /// <summary>
/// 看Bit是否是1
/// </summary>
/// <param name="day"></param>
/// <returns></returns>
public bool CheckBit(int day)
{
bool exists = false;
int index = day & 0x3F;//得到mod 64的结果
int unit = day >> ;//得到除64的整数结果
ulong subCode = ;
subCode <<= index;
switch (unit)
{
case :
exists = (this.Days193To256 & subCode) >= ;
break;
case :
exists = (this.Days129To192 & subCode) >= ;
break;
case :
exists = (this.Day65To128 & subCode) >= ;
break;
case :
exists = (this.Day1To64 & subCode) >= ;
break;
}
return exists;
} public bool Equals(BitMap bitmap)
{
return (bitmap.Days193To256 == this.Days193To256
&& bitmap.Days129To192 == this.Days129To192
&& bitmap.Day65To128 == this.Day65To128
&& bitmap.Day1To64 == this.Day1To64);
}
} /// <summary>
/// 全局位图信息,提高性能
/// </summary>
public BitMap GlobalBitmap; /// <summary>
/// 索引器
/// </summary>
/// <param name="key"></param>
/// <returns></returns>
public TValue this[DateTime key]
{
get
{
int relativeDay = this.RelativeDay(key);
if (this.GlobalBitmap.CheckBit(relativeDay))
{
var current = this.Head;
while (current != null)
{
if (current.bitMap.CheckBit(relativeDay))
{
return current.value;
}
current = current.next;
}
//如果代码执行到这里,说明全局bit位图有而实际没有,此时,纠正全局比特位图
this.GlobalBitmap.ResetBit(relativeDay);
}
throw new Exception(BitMapDicException.NotFountElement);
}
set
{
this.AddElement(this.RelativeDay(key), value);
}
} /// <summary>
/// 向链表添加,判断是否重复
/// </summary>
/// <param name="key"></param>
/// <param name="value"></param>
public void Add(DateTime key, TValue value)
{
int relativeDay = this.RelativeDay(key);
if (this.GlobalBitmap.CheckBit(relativeDay))
{
throw new Exception(BitMapDicException.AlreadyExists);
}
this.AddElement(relativeDay, value);
} /// <summary>
/// 校验全局位图是否有误
/// </summary>
/// <returns></returns>
public bool CheckGlobalBitmap()
{
BitMap bitmapChk = new BitMap();
var current = this.Head;
int i = ;
while (current != null)
{
bitmapChk.Day1To64 |= current.bitMap.Day1To64;
bitmapChk.Day65To128 |= current.bitMap.Day65To128;
bitmapChk.Days129To192 |= current.bitMap.Days129To192;
bitmapChk.Days193To256 |= current.bitMap.Days193To256;
current = current.next;
i++;
}
return this.GlobalBitmap.Equals(bitmapChk);
} /// <summary>
/// 无条件向链表添加,不判断是否重复(++先添加,在删除)
/// </summary>
/// <param name="relativeDay"></param>
/// <param name="value"></param>
public void AddElement(int relativeDay, TValue value)
{
if (this.Head == null)
{
this.Head = new Entry() { value = value };
this.Head.bitMap.SetBit(relativeDay);
}
else
{
Entry before = null;
var current = this.Head;
Entry bookUpCurrent = null;//添加时保留旧对象
Entry bookUpBefore = null;//添加时保留旧对象前一个对象
bool dayNotFound = true;
bool objNotEqual = true;
while (true)
{
if (objNotEqual)
{
//如果对象相等,则设置objNotEqual = false,看他的bitMap是否已设置,有,则设置dayNotFound =false并break;没有则设置它的bitMap并continue;
if (this.checkEqual(current.value, value))
{
objNotEqual = false;
if (current.bitMap.CheckBit(relativeDay))
{
break;
}
else
{
current.bitMap.SetBit(relativeDay);
if (current.next == null)
{
break;
}
else
{
before = current;
current = current.next;
}
}
}
}
if (dayNotFound)
{
//如果日期发现已设,则看是否已有相等对象,如果有,则直接reset bitmap并break;如果没有,则继续进行bit clear操作;
if (current.bitMap.CheckBit(relativeDay))
{
dayNotFound = false;
if (false == objNotEqual)
{
//先不做bit操作,保留现场
bookUpCurrent = current;
bookUpBefore = before;
break;
}
else
{
//先不做bit操作,保留现场
bookUpCurrent = current;
bookUpBefore = before;
}
}
}
if (current.next == null)
{
break;
}
else
{
before = current;
current = current.next;
}
}
if (objNotEqual)
{
//如果一直没有找到相等元素,则直接在链表最后追加一个,且设置一下bitMap
current.next = new Entry() { value = value };
current.next.bitMap.SetBit(relativeDay);
}
//对保留的现场进行处理
if (bookUpCurrent != null)
{
bookUpCurrent.bitMap.ResetBit(relativeDay);
DeleteElementFromChain(bookUpCurrent, bookUpBefore);
}
}
//添加成功,全局位设置
this.GlobalBitmap.SetBit(relativeDay);
} /// <summary>
/// 看Key是否存在
/// </summary>
/// <param name="key"></param>
/// <returns></returns>
public bool ContainsKey(DateTime key)
{
int relativeDay = this.RelativeDay(key);
//直接看全局bitMap,有则直接返回
if (this.GlobalBitmap.CheckBit(relativeDay))
{
return true;
}
return false;
} /// <summary>
/// 去掉某天数据
/// </summary>
/// <param name="key"></param>
/// <returns></returns>
public bool Remove(DateTime key)
{
int relativeDay = this.RelativeDay(key);
if (this.GlobalBitmap.CheckBit(relativeDay))
{
var current = this.Head;
Entry before = null;
while (current != null)
{
if (current.bitMap.CheckBit(relativeDay))
{
current.bitMap.ResetBit(relativeDay);
DeleteElementFromChain(current, before);
break;
}
else
{
before = current;
current = current.next;
}
}
//纠正全局比特位图
this.GlobalBitmap.ResetBit(relativeDay);
}
return false;
} /// <summary>
/// 获取某天数据
/// </summary>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns></returns>
public bool TryGetValue(DateTime key, out TValue value)
{
value = default(TValue);
int relativeDay = this.RelativeDay(key);
if (this.GlobalBitmap.CheckBit(relativeDay))
{
var current = this.Head;
while (current != null)
{
if (current.bitMap.CheckBit(relativeDay))
{
value = current.value;
return true;
}
else
{
current = current.next;
}
}
//如果代码执行到这里,说明全局bit位图有而实际没有,此时,纠正全局比特位图
this.GlobalBitmap.ResetBit(relativeDay);
}
return false;
} /// <summary>
/// 统计key value数量
/// </summary>
public int Count
{
get
{
int num;
int num2 = ;
int bitA = GetBitNum(this.GlobalBitmap.Days193To256);
int bitB = GetBitNum(this.GlobalBitmap.Days129To192);
int bitC = GetBitNum(this.GlobalBitmap.Day65To128);
int bitD = GetBitNum(this.GlobalBitmap.Day1To64);
num = bitA + bitB + bitC + bitD;
var current = this.Head;
while (current != null)
{
num2++;
current = current.next;
}
if (num != num2)
{
//写报警
}
return num;
}
}
/// <summary>
/// 非只读
/// </summary>
public bool IsReadOnly
{
get
{
return false;
}
} /// <summary>
/// 清空链表
/// </summary>
public void Clear()
{
this.GlobalBitmap = new BitMap();
this.Head = null;
} #region 功能函数
/// <summary>
/// 去掉某个失效元素
/// </summary>
/// <param name="current"></param>
/// <param name="before"></param>
public void DeleteElementFromChain(Entry current, Entry before)
{
//如果去掉当日生效后当前实体对任何一天都不生效,则remove实体
if (current.bitMap.Days193To256 == && current.bitMap.Days129To192 == && current.bitMap.Day65To128 == && current.bitMap.Day1To64 == )
{
if (before == null)
{
this.Head = current.next;
}
else
{
before.next = current.next;
}
}
} /// <summary>
/// 获取时间差,并按位取反,如设定相对时间是2015-1-5,则2015-1-5号则记为第1天,2015-1-6号则记为第2天
/// </summary>
/// <param name="dateTime"></param>
/// <returns></returns>
public int RelativeDay(DateTime dateTime)
{
int relativedays = -;
if (dateTime >= this.relativeDateTime)
{
relativedays = (dateTime - this.relativeDateTime).Days;
}
if (relativedays >= )
{
throw new Exception(BitMapDicException.DateOver);
}
else if (relativedays == -)
{
throw new Exception(BitMapDicException.DateLow);
}
return - relativedays;
} /// <summary>
/// 取得bit为1的个数
/// </summary>
/// <param name="dayBit"></param>
/// <returns></returns>
private int GetBitNum(ulong dayBit)
{
int count = ;
ulong subCode = ;
for (int i = ; i < ; i++)
{
if ((dayBit & subCode) >= )
{
count++;
}
subCode <<= ;
}
return count;
} #endregion #region 未实现的一些方法
public void Add(KeyValuePair<DateTime, TValue> item)
{
throw new Exception(BitMapDicException.NotImplement);
} public bool Contains(KeyValuePair<DateTime, TValue> item)
{
throw new Exception(BitMapDicException.NotImplement);
} public ICollection<DateTime> Keys
{
get
{
//Keys接口暂不实现,如果日后需要实现,可通过Global比特位图的方式实现ICollection接口,不要
throw new Exception(BitMapDicException.KeysNotImplement);
}
} public ICollection<TValue> Values
{
get
{
//values接口暂不实现,如果日后需要实现,可通过Head头的方式实现ICollection接口
throw new Exception(BitMapDicException.ValuesNotImplement);
}
} public bool Remove(KeyValuePair<DateTime, TValue> item)
{
throw new Exception(BitMapDicException.AlreadyExists);
} public void CopyTo(KeyValuePair<DateTime, TValue>[] array, int arrayIndex)
{
throw new Exception(BitMapDicException.AlreadyExists);
} IEnumerator IEnumerable.GetEnumerator()
{
throw new Exception(BitMapDicException.AlreadyExists);
} IEnumerator<KeyValuePair<DateTime, TValue>> IEnumerable<KeyValuePair<DateTime, TValue>>.GetEnumerator()
{
throw new Exception(BitMapDicException.AlreadyExists);
}
#endregion public class BitMapDicException
{
public static readonly String ChkTkey = "Tkey必须是时间类型";
public static readonly String ChkEqualFunc ="必须设定相等比较方法";
public static readonly String KeysNotImplement = "Keys接口不实现";
public static readonly String ValuesNotImplement ="Values接口不实现";
public static readonly String NotFountElement = "索引器未发现指定元素";
public static readonly String DateOver = "data OutOfRange最大只能存放256天数据";
public static readonly String DateLow = "data OutOfRange不能存放相对日期之前的数据";
public static readonly String AlreadyExists ="元素已存在";
public static readonly String NotImplement = "该成员未实现";
public static readonly String ValueCannotBeNull = "如果要去掉某个元素,请用Remove功能!赋值null表示增加一个value为null的元素,对定制的字典而言无意义!";
public static readonly String NotExistsEqualFunction ="不存在对象相等比较函数!";
}
} /// <summary>
/// Main中包含了一些基本的单元测试
/// </summary>
/// <param name="args"></param>
static void Main(string[] args)
{
RoomInfo r1 = new RoomInfo();
RoomInfo r2 = new RoomInfo();
Console.WriteLine("== 检查:" + (r1 == r2));
Console.WriteLine("Equals检查:" + r1.Equals(r2));
//添加300天
BitMapDictionary<RoomInfo> dic = new BitMapDictionary<RoomInfo>(-);
for (int i = ; i <= ; i++)
{
dic.Add(DateTime.Now.AddDays(i - ), new RoomInfo() { FGPrice = (i - ) * 0.1m, PPPrice = (i - ) * 0.1m + 0.1m });
}
//全局位是否正确
bool globalCheck = true;
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); BitMapDictionary<RoomInfo> dic2 = new BitMapDictionary<RoomInfo>(dic);
dic = dic2;
//看看取出来和刚才放进去是否一致;
bool checkElement = true;
for (int i = ; i <= ; i++)
{
RoomInfo roomTemp;
dic.TryGetValue(DateTime.Now.AddDays(i - ), out roomTemp);
checkElement = checkElement && RoomInfo.Equals(roomTemp, new RoomInfo() { FGPrice = (i - ) * 0.1m, PPPrice = (i - ) * 0.1m + 0.1m });
}
Console.WriteLine("对象逐步比较:" + checkElement);
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); ///各自移除第一天,看看bit位对不对
for (int i = ; i < ; i++)
{
dic.Remove(DateTime.Now.AddDays(i * - ));
}
Console.WriteLine("设置了256天,各自减去最高位,检查:" + (dic.GlobalBitmap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0x7fffffffffffffff, Day65To128 = 0x7fffffffffffffff, Days129To192 = 0x7fffffffffffffff, Days193To256 = 0x7fffffffffffffff })));
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); for (int i = ; i <= ; i++)
{
RoomInfo roomTemp;
dic.Remove(DateTime.Now.AddDays(i - ));
}
Console.WriteLine("移除所有数据,字典数量为0:" + (dic.Count == )); //检查字典global
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); ///将增加逻辑改成索引器
for (int i = ; i <= ; i++)
{
dic[DateTime.Now.AddDays(i - )] = new RoomInfo() { FGPrice = (i - ) * 0.1m, PPPrice = (i - ) * 0.1m + 0.1m };
}
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("改成索引器赋值后,全局检查:" + globalCheck); ///添加256天,看看是不是满位
Console.WriteLine("设置了256天,是否满位:" + (dic.GlobalBitmap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0xffffffffffffffff, Day65To128 = 0xffffffffffffffff, Days129To192 = 0xffffffffffffffff, Days193To256 = 0xffffffffffffffff })));
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); //将每个bit组中的第一个作为共享对象,其它的和他一样,看看是不是共享了
for (int i = ; i <= ; i++)
{
int unit = (int)(i / );//得到除64的整数结果
dic[DateTime.Now.AddDays(i - )] = new RoomInfo() { FGPrice = (unit - ) * 0.1m, PPPrice = (unit - ) * 0.1m + 0.1m };
}
bool global = true;
if (!dic.Head.bitMap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0xffffffffffffffff, Day65To128 = 0x0, Days129To192 = 0x0, Days193To256 = 0x0 })
|| !dic.Head.next.bitMap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0x0, Day65To128 = 0xffffffffffffffff, Days129To192 = 0x0, Days193To256 = 0x0 })
|| !dic.Head.next.next.bitMap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0x0, Day65To128 = 0x0, Days129To192 = 0xffffffffffffffff, Days193To256 = 0x0 })
|| !dic.Head.next.next.next.bitMap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0x0, Day65To128 = 0x0, Days129To192 = 0x0, Days193To256 = 0xffffffffffffffff })
|| !dic.GlobalBitmap.Equals(new BitMapDictionary<RoomInfo>.BitMap() { Day1To64 = 0xffffffffffffffff, Day65To128 = 0xffffffffffffffff, Days129To192 = 0xffffffffffffffff, Days193To256 = 0xffffffffffffffff })
)
{
global = false;
}
Console.WriteLine("对象共享检查:" + global);
///在上面的基础上,将2-10改成一天数据,看看是否共享了对象
globalCheck = globalCheck && dic.CheckGlobalBitmap();
Console.WriteLine("全局检查:" + globalCheck); for (int i = ; i <= ; i++)
{
dic[DateTime.Now.AddDays(i - )] = null;
} Console.WriteLine("测试结束");
Console.ReadKey();
}
}
public class RoomInfo
{
public decimal PPPrice;
public decimal FGPrice; public override bool Equals(Object r1)
{
RoomInfo r = (RoomInfo)r1;
return r != null && r.PPPrice == this.PPPrice && r.FGPrice == this.FGPrice;
} public override int GetHashCode()
{
return this.FGPrice.GetHashCode() ^ this.PPPrice.GetHashCode();
}
}
}

System.Configuration.ConfigurationManager.AppSettings.Get("QueryInfoBusinType")

上一篇:iOS runtime实用篇解决常见Crash


下一篇:以图搜图(一):Python实现dHash算法(转)