DEFINE_bool(malloc_page_fence, EnvToBool("TCMALLOC_PAGE_FENCE", false),
            "Enables putting of memory allocations at page boundaries "
            "with a guard page following the allocation (to catch buffer "
            "overruns right when they happen).");

So just flipping this false to true is enough? To get the page-fence feature into a project, all that is needed is to add -ltcmalloc_debug at link time. I added it to the project, tried running it, and it crashed straight away.
A closer look showed the crash was caused by a lot of uninitialized member variables in the project: tcmalloc_debug initializes memory returned by new and malloc to a fixed value, so any variable that is never initialized gets exposed very quickly.
After fixing that, I rebuilt and ran again: another crash, this time inside mprotect, with an out-of-memory error. How could that be? It turned out a resource limit had been hit.
echo 128000 > /proc/sys/vm/max_map_count
raises the limit on the number of memory mappings; after that, everything ran fine.
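To make the failure modes concrete, here is a minimal, hypothetical sketch (not code from the project above; the struct, values, and build command are my own assumptions) of the two kinds of bug this setup exposes: an uninitialized member read, and a buffer overrun that the guard page is meant to trap.

/* Hypothetical demo, not the article's project code.
 * Build (assumption): gcc demo.c -o demo -ltcmalloc_debug
 * Enable the fence (assumption, based on the EnvToBool call quoted above):
 *   TCMALLOC_PAGE_FENCE=true ./demo
 * or flip the flag's default to true as the article does.
 */
#include <stdio.h>
#include <stdlib.h>

struct conn {
    int fd;
    int ready;    /* never initialized below */
};

int main(void) {
    /* 1. Uninitialized read: with the system malloc this memory is often
     *    zero and the bug stays hidden; the debug allocator fills it with
     *    a fixed pattern, so `ready` holds garbage and the branch below
     *    misbehaves immediately. */
    struct conn *c = malloc(sizeof *c);
    c->fd = 3;
    if (c->ready)                  /* reads an uninitialized field */
        printf("unexpectedly 'ready': %d\n", c->ready);
    free(c);

    /* 2. Buffer overrun: with the page fence enabled, a guard page follows
     *    the allocation, so an overrun like this is likely to fault the
     *    moment it happens instead of silently corrupting a neighbour
     *    (exact behaviour depends on padding and alignment). */
    char *buf = malloc(16);
    buf[16] = 'x';                 /* writes 1 byte past the end */
    free(buf);
    return 0;
}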
    if (idx < TVR_SIZE) {
        int i = expires & TVR_MASK;
        vec = base->tv1.vec + i;
    } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
        int i = (expires >> TVR_BITS) & TVN_MASK;
        vec = base->tv2.vec + i;
    } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
        vec = base->tv3.vec + i;
    } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
        vec = base->tv4.vec + i;
    } else if ((signed long) idx < 0) {
        /*
         * Can happen if you add a timer with expires == jiffies,
         * or you set a timer to go off in the past
         */
        vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
    } else {
        int i;
        /* If the timeout is larger than 0xffffffff on 64-bit
         * architectures then we use the maximum timeout:
         */
        if (idx > 0xffffffffUL) {
            idx = 0xffffffffUL;
            expires = idx + base->timer_jiffies;
        }
        i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
        vec = base->tv5.vec + i;
    }
    /*
     * Timers are FIFO:
     */
    list_add_tail(&timer->entry, vec);
}

From the code above you can see how Linux files timers into the wheel: a timer due within the next TVR_SIZE jiffies is hung under the first group, tv1, specifically on the list indexed by expires & TVR_MASK. Timers that expire on the same jiffy end up on the same list. Likewise, the second group holds timers whose expiry is less than 1 << (TVR_BITS + TVN_BITS) jiffies away, the third group those under 1 << (TVR_BITS + 2 * TVN_BITS) jiffies, and the fourth group those under 1 << (TVR_BITS + 3 * TVN_BITS) jiffies. Anything beyond 1 << (TVR_BITS + 3 * TVN_BITS) goes into the fifth group.
This way every timer that is about to expire ends up in the first group, and at any moment the list of timers that must run can be found directly via the current jiffies value & TVR_MASK, so inserting a timer costs O(1).

Below is the code that runs the timers:

static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
    /* cascade all the timers from tv up one level */
    struct timer_list *timer, *tmp;
    struct list_head tv_list;
    list_replace_init(tv->vec + index, &tv_list);

    /*
     * We are removing _all_ timers from the list, so we
     * don't have to detach them individually.
     */
    list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
        BUG_ON(tbase_get_base(timer->base) != base);
        internal_add_timer(base, timer);
    }

    return index;
}
/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * This function cascades all vectors and executes all expired timer
 * vectors.
 */
static inline void __run_timers(struct tvec_base *base)
{
    struct timer_list *timer;

    spin_lock_irq(&base->lock);
    while (time_after_eq(jiffies, base->timer_jiffies)) {
        struct list_head work_list;
        struct list_head *head = &work_list;
        int index = base->timer_jiffies & TVR_MASK;
        /* ... the rest of the loop cascades tv2-tv5 down a level when
         * needed and then runs every timer queued on this tv1 slot ... */
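To make the bucket selection quoted above easier to experiment with, here is a small user-space sketch of the same index computation. It is not kernel code: the constants assume the usual configuration (TVR_BITS = 8, TVN_BITS = 6), pick_bucket and the demo in main are made up for illustration, and the negative-idx and 0xffffffff special cases are omitted.

#include <stdio.h>

/* Same layout as the kernel wheel quoted above: one 256-slot root level and
 * four 64-slot upper levels (values assumed from the common configuration). */
#define TVR_BITS 8
#define TVN_BITS 6
#define TVR_SIZE (1UL << TVR_BITS)
#define TVN_SIZE (1UL << TVN_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK (TVN_SIZE - 1)

/* Which level (1..5) and slot a timer lands in, given the wheel's current
 * time and the absolute expiry, both in jiffies. */
static void pick_bucket(unsigned long timer_jiffies, unsigned long expires,
                        int *level, unsigned long *slot) {
    unsigned long idx = expires - timer_jiffies;   /* distance to expiry */

    if (idx < TVR_SIZE) {
        *level = 1; *slot = expires & TVR_MASK;
    } else if (idx < 1UL << (TVR_BITS + TVN_BITS)) {
        *level = 2; *slot = (expires >> TVR_BITS) & TVN_MASK;
    } else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS)) {
        *level = 3; *slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
    } else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS)) {
        *level = 4; *slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
    } else {
        *level = 5; *slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
    }
}

int main(void) {
    unsigned long now = 1000;
    unsigned long expiries[] = { 1001, 5000, 200000, 2000000, 90000000 };
    for (int i = 0; i < 5; i++) {
        int level; unsigned long slot;
        pick_bucket(now, expiries[i], &level, &slot);
        printf("expires=%lu -> tv%d slot %lu\n", expiries[i], level, slot);
    }
    return 0;
}

With this layout, inserting a timer is just the index computation plus a list_add_tail, which is where the O(1) insertion mentioned above comes from; the upper levels are only touched when __run_timers cascades them down a level.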
The select path, by contrast, has to rescan every descriptor on each pass; this is the heart of the kernel's do_select() loop:

    for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
        unsigned long in, out, ex, all_bits, bit = 1, mask, j;
        unsigned long res_in = 0, res_out = 0, res_ex = 0;
        const struct file_operations *f_op = NULL;
        struct file *file = NULL;

        in = *inp++; out = *outp++; ex = *exp++;
        all_bits = in | out | ex;
        if (all_bits == 0) {
            i += __NFDBITS;
            continue;
        }

        for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
            int fput_needed;
            if (i >= n)
                break;
            if (!(bit & all_bits))
                continue;
            file = fget_light(i, &fput_needed);
            if (file) {
                f_op = file->f_op;
                mask = DEFAULT_POLLMASK;
                if (f_op && f_op->poll)
                    mask = (*f_op->poll)(file, retval ? NULL : wait);
                fput_light(file, fput_needed);
                if ((mask & POLLIN_SET) && (in & bit)) {
                    res_in |= bit;
                    retval++;
                }
                if ((mask & POLLOUT_SET) && (out & bit)) {
                    res_out |= bit;
                    retval++;
                }
                if ((mask & POLLEX_SET) && (ex & bit)) {
                    res_ex |= bit;
                    retval++;
                }
            }
        }
        if (res_in)
            *rinp = res_in;
        if (res_out)
            *routp = res_out;
        if (res_ex)
            *rexp = res_ex;
        cond_resched();
    }
    wait = NULL;
    if (retval || timed_out || signal_pending(current))
        break;
    if (table.error) {
        retval = table.error;
        break;
    }

    /*
     * If this is the first loop and we have a timeout
     * given, then we convert to ktime_t and set the to
     * pointer to the expiry value.
     */
    if (end_time && !to) {
        expire = timespec_to_ktime(*end_time);
        to = &expire;
    }

    if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
        timed_out = 1;
}
__set_current_state(TASK_RUNNING);
/*
 * Attach the item to the poll hooks and get current event bits.
 * We can safely use the file* here because its usage count has
 * been increased by the caller of this function. Note that after
 * this operation completes, the poll callback can start hitting
 * the new item.
 */
revents = tfile->f_op->poll(tfile, &epq.pt);
Here, too, the file system's poll function is called, but this time a structure is initialized first, and that structure carries a callback for the poll function: ep_ptable_queue_proc.
When the poll function is invoked it executes this callback, and the callback's job is to add the current process to the socket's wait queue.

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;
    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
Note that the whead parameter is actually sk->sleep, so this really does add the current process to the socket's wait queue. When the socket receives data, or some other event fires, the sock_def_readable or sock_def_write_space notification functions are called to wake the waiting processes; both functions are installed in the sk structure when the socket is created.
From the analysis above, epoll really is much smarter and much lighter than select: there is no need to grind through a poll of every descriptor anymore.
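For contrast at the user-space level, here is a minimal sketch of the epoll API that this wait-queue machinery backs. The single listening socket, the port number, and the lack of error handling are simplifications for the example, not anything taken from the kernel code above.

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void) {
    /* A listening socket, just to have something to wait on. */
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(12345);
    bind(lfd, (struct sockaddr *)&addr, sizeof addr);
    listen(lfd, 16);

    /* Register interest once; the ep_ptable_queue_proc hook described above
     * is what parks the waiter on the socket's wait queue. */
    int epfd = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = lfd };
    epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

    /* Unlike select, each wakeup hands back only the ready descriptors;
     * nothing has to be rescanned per call. */
    struct epoll_event events[16];
    int n = epoll_wait(epfd, events, 16, 5000 /* ms */);
    for (int i = 0; i < n; i++)
        if (events[i].data.fd == lfd)
            printf("listening socket is readable: accept() would not block\n");

    close(epfd);
    close(lfd);
    return 0;
}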
1. Usage
I opened the project's home page; since the company network blocks SVN updates from outside, I could only download the packaged source. After unpacking it I found a doc directory, went in, opened the usage document, and found the usage to be extremely simple:

To use TCMalloc, just link TCMalloc into your application via the "-ltcmalloc" linker flag.

A look at the algorithm showed nothing special either: it is similar in spirit to slab and to the SGI STL allocator.
On Unix you really only have to link in the tcmalloc library. Very convenient, but I had no Linux environment at hand, and the document said nothing about how to use it on Windows.
Opening the source package, there is a VS2003 solution. I opened it, picked a test project at random, looked at the project properties, and found only two differences:
1. The linker command line has an extra "..\..\release\libtcmalloc_minimal.lib", i.e. the project links against this memory-optimization library.
2. Linker -> Input -> Force Symbol References has an extra __tcmalloc.
With that the tcmalloc library can be used correctly; I tried it, and the test project ran fine.
As described earlier, tcmalloc really does use a rather hackish trick to transparently replace the system's memory allocation functions (when I use this kind of technique it is usually for mischief...), but that alone does not explain why it is so much faster than our own allocator.
Back to the tcmalloc manual: besides the usual small-object management, tcmalloc treats multi-threaded environments specially, which is quite different from the allocators I had seen before. Most allocator authors take the lazy route and push the threading problem onto the user, typically by adding a bool template parameter that says whether the environment is multi-threaded, calling it "customizable", and then bragging about the superiority of templates.
So what does tcmalloc do? The answer is one ThreadCache per thread. Most operating systems support thread-local storage, the legendary TLS, which makes a per-thread allocator possible, so allocations from different threads are served from their own ThreadCache. Because our project's allocator runs in a multi-threaded environment, it simply locks everything across the board, and performance suffers accordingly.
That by itself is still not enough to separate tcmalloc from ptmalloc2, which also gives every thread its own cache. On this point the doc has a passage worth quoting verbatim:

ptmalloc2 also reduces lock contention by using per-thread arenas but
there is a big problem with ptmalloc2's use of per-thread arenas. In
ptmalloc2 memory can never move from one arena to another. This can
lead to huge amounts of wasted space.

Roughly: ptmalloc2 also uses per-thread state (TLS) to reduce lock contention, but in ptmalloc2 each thread's memory is independent. Memory requested by the first thread must, when freed, go back into the first thread's pool (it cannot move to another arena), which can waste a large amount of memory.
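To illustrate the per-thread-cache idea in isolation (this is neither tcmalloc's ThreadCache nor ptmalloc2's arena code, just a toy sketch of the pattern), each thread keeps a small free list in thread-local storage and only takes a lock on the slow path:

#include <pthread.h>
#include <stdlib.h>

/* Toy per-thread cache of fixed-size blocks. Real allocators are far more
 * elaborate; this only shows why the common path needs no lock.
 * __thread is the GCC/Clang TLS keyword; C11 _Thread_local also works. */
#define BLOCK_SIZE  64
#define CACHE_SLOTS 32

static __thread void *tl_cache[CACHE_SLOTS];   /* thread-local free list */
static __thread int   tl_count;

static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

void *cached_alloc(void) {
    if (tl_count > 0)                     /* fast path: no lock at all */
        return tl_cache[--tl_count];

    /* Slow path: stand-in for the shared central heap. */
    pthread_mutex_lock(&global_lock);
    void *p = malloc(BLOCK_SIZE);
    pthread_mutex_unlock(&global_lock);
    return p;
}

void cached_free(void *p) {
    if (tl_count < CACHE_SLOTS) {         /* fast path: keep it local */
        tl_cache[tl_count++] = p;
        return;
    }
    pthread_mutex_lock(&global_lock);     /* overflow back to the shared heap */
    free(p);
    pthread_mutex_unlock(&global_lock);
}

The difference the quoted doc passage points at lives on that slow path: tcmalloc periodically moves objects between thread caches and central free lists, while in ptmalloc2 memory never moves from one arena to another, which is where the wasted space comes from.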
Imagine this scenario: a DLL is loaded before tcmalloc and allocates some memory (with the CRT's malloc). After tcmalloc is loaded, it replaces all the free functions, and then, at some later point, the code in that earlier DLL frees the memory. Isn't that dangerous? Actual testing showed no problem at all, and the key is here:

  span = Static::pageheap()->GetDescriptor(p);
  if (!span) {
    // span can be NULL because the pointer passed in is invalid
    // (not something returned by malloc or friends), or because the
    // pointer was allocated with some other allocator besides
    // tcmalloc. The latter can happen if tcmalloc is linked in via
    // a dynamic library, but is not listed last on the link line.
    // In that case, libraries after it on the link line will
    // allocate with libc malloc, but free with tcmalloc's free.
    (*invalid_free_fn)(ptr);  // Decide how to handle the bad free request
    return;
  }

tcmalloc uses the span to tell whether the memory was allocated by tcmalloc itself; if not, tcmalloc calls the DLL's original corresponding function (this is the important part) to release it. That is how this thorny problem is solved.
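Here is a minimal sketch of that "is this pointer ours?" pattern. It is not tcmalloc's implementation: names such as my_free, page_table, and real_free are invented for the example, and a flat page table stands in for tcmalloc's page-to-span radix tree.

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define PAGE_SHIFT 12             /* 4 KiB pages, as an assumption */
#define TABLE_SIZE (1u << 20)     /* enough page slots for the demo */

/* One byte per page: nonzero means "this page belongs to our allocator".
 * (Registration of our own pages into the table is omitted here.) */
static unsigned char page_table[TABLE_SIZE];

/* The original CRT free; in a real hook this would be looked up when the
 * allocation functions are patched. */
static void (*real_free)(void *) = free;

static bool owned_by_us(void *p) {
    uintptr_t page = (uintptr_t)p >> PAGE_SHIFT;
    return page < TABLE_SIZE && page_table[page] != 0;
}

static void my_internal_free(void *p) {
    /* ... return the block to our own free lists ... */
    (void)p;
}

/* The hooked free: the same decision the quoted tcmalloc code makes with
 * GetDescriptor(). If no descriptor is found for the pointer, hand it back
 * to the allocator that actually created it. */
void my_free(void *p) {
    if (p == NULL)
        return;
    if (owned_by_us(p))
        my_internal_free(p);
    else
        real_free(p);             /* allocated by the CRT or another allocator */
}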