SystemTap 使用技巧之三

2017-09-04 11:50:16 +08:00
 sherryxueli

接上篇~

7.14 修改进程中的变量

root@j9 ~# cat stap_set_var.c -n     
     1  #include <stdio.h>
     2  /* Demo target for stap_set_var.stp: the probe rewrites p->id between the two printf calls. */
     3  typedef struct policy {
     4      int     id;
     5  } policy_t;
     6
     7  int main(int argc, char *argv[])
     8  {
     9      policy_t policy;
    10      policy_t *p = &policy;
    11      policy_t **pp;
    12
    13      p->id = 111;  /* initial value, printed by the first printf */
    14
    15      printf("before stap set, p->id: %d\n", p->id);
    16
    17      pp = &p;  /* source line 17: the statement stap_set_var.stp attaches its probe to */
    18
    19      printf("after stap set, p->id: %d, (*pp)->id: %d\n", p->id, (*pp)->id);  /* reads id through both p and *pp */
    20
    21      return 0;
    22  }

root@j9 ~# gcc -Wall -g -o ./stap_set_var ./stap_set_var.c      

root@j9 ~# cat stap_set_var.stp
// Statement probe placed at source line 17 of main() in ./stap_set_var.c.
probe process("./stap_set_var").statement("main@./stap_set_var.c:17")
{
    // Write to the target process: set the id field of the struct that its local pointer p refers to.
    $p->id = 222;
    // The trailing-$ form $p$ yields a printable dump of what p points to;
    // the "$p$" inside the format string is just literal text.
    printf("$p$: %s\n", $p$)
}

root@j9 ~# stap -g stap_set_var.stp -c ./stap_set_var         
before stap set, p->id: 111
after stap set, p->id: 222, (*pp)->id: 222
$p$: {.id=222}

root@j9 ~#

可以看出在第 17 行用 SystemTap 修改后的值在第 19 行就生效了。 需要注意的是 stap 要加-g 参数在 guru 模式下才能修改变量的值。

7.15 跟踪进程执行流程

thread_indent(n): 返回带缩进的前缀字符串,n 为本次缩进的增量(负数表示回退)

ppfunc(): 当前探测点所在的函数名

在 call 探测点调用 thread_indent(4) 补充 4 个空格,在 return 探测点调用 thread_indent(-4) 回退 4 个空格,效果如下:

#cat trace_nginx.stp
// Entry probe for every function defined in the nginx sources matching src/http/ngx_http_*.
probe process("/home/admin/tengine/bin/nginx").function("*@src/http/ngx_http_*").call
{
    // thread_indent(4) deepens the indentation by 4 and returns the line prefix;
    // ppfunc() is the name of the function this probe fired in.
    printf("%s -> %s\n", thread_indent(4), ppfunc());
}

// Return probe for every function defined in the nginx sources matching src/http/ngx_http_*.
probe process("/home/admin/tengine/bin/nginx").function("*@src/http/ngx_http_*").return
{
    // thread_indent(-4) steps the indentation back by 4 before the "<-" line is printed.
    printf("%s <- %s\n", thread_indent(-4), ppfunc());
}

#stap trace_nginx.stp
     0 nginx(11368):    -> ngx_http_init_connection
    21 nginx(11368):    <- ngx_http_init_connection
     0 nginx(11368):    -> ngx_http_wait_request_handler
    30 nginx(11368):        -> ngx_http_create_request
    41 nginx(11368):        <- ngx_http_create_request
    55 nginx(11368):        -> ngx_http_process_request_line
    72 nginx(11368):            -> ngx_http_read_request_header
    78 nginx(11368):            <- ngx_http_read_request_header
    91 nginx(11368):            -> ngx_http_parse_request_line
    99 nginx(11368):            <- ngx_http_parse_request_line
   109 nginx(11368):            -> ngx_http_process_request_uri
   115 nginx(11368):            <- ngx_http_process_request_uri
   127 nginx(11368):            -> ngx_http_process_request_headers
   138 nginx(11368):                -> ngx_http_read_request_header
   143 nginx(11368):                <- ngx_http_read_request_header
   155 nginx(11368):                -> ngx_http_parse_header_line
   163 nginx(11368):                <- ngx_http_parse_header_line
   178 nginx(11368):                -> ngx_http_process_user_agent
   185 nginx(11368):                <- ngx_http_process_user_agent
   192 nginx(11368):                -> ngx_http_parse_header_line
   198 nginx(11368):                <- ngx_http_parse_header_line
   208 nginx(11368):                -> ngx_http_process_host
   222 nginx(11368):                    -> ngx_http_validate_host
   229 nginx(11368):                    <- ngx_http_validate_host
   239 nginx(11368):                    -> ngx_http_set_virtual_server
   252 nginx(11368):                        -> ngx_http_find_virtual_server
   259 nginx(11368):                        <- ngx_http_find_virtual_server
   263 nginx(11368):                    <- ngx_http_set_virtual_server
   266 nginx(11368):                <- ngx_http_process_host
   274 nginx(11368):                -> ngx_http_parse_header_line
   279 nginx(11368):                <- ngx_http_parse_header_line
   287 nginx(11368):                -> ngx_http_parse_header_line
   292 nginx(11368):                <- ngx_http_parse_header_line

   .....

  2072 nginx(11368):                                <- ngx_http_finalize_request
  2076 nginx(11368):                            <- ngx_http_core_content_phase
  2079 nginx(11368):                        <- ngx_http_core_run_phases
  2083 nginx(11368):                    <- ngx_http_handler
  2093 nginx(11368):                    -> ngx_http_run_posted_requests
  2100 nginx(11368):                    <- ngx_http_run_posted_requests
  2103 nginx(11368):                <- ngx_http_process_request
  2107 nginx(11368):            <- ngx_http_process_request_headers
  2111 nginx(11368):        <- ngx_http_process_request_line
  2114 nginx(11368):    <- ngx_http_wait_request_handler
     0 nginx(11368):    -> ngx_http_keepalive_handler
    26 nginx(11368):        -> ngx_http_close_connection
    79 nginx(11368):        <- ngx_http_close_connection
    83 nginx(11368):    <- ngx_http_keepalive_handler

7.16 查看代码执行路径

pp(): 输出当前被激活的探测点

#cat ngx_http_process_request.stp
// Statement probe with a ":*" line wildcard: one probe per line of
// ngx_http_process_request in src/http/ngx_http_request.c.
probe process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:*") {
    // pp() is the full text of the probe point that fired, including the source line number.
    printf("%s\n", pp())
}

#stap ngx_http_process_request.stp 
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2762")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2768")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2771")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2773")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2774")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2783")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2835")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2840")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2841")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2842")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2843")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2846")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2847")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2848")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2850")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2852")
process("/home/admin/tengine/bin/nginx").statement("ngx_http_process_request@src/http/ngx_http_request.c:2853")
^C

可以看出该函数哪些行被执行了。

7.17 巧用正则匹配过滤

在排查问题时,可以利用一些正则匹配来获取自己想要的信息,比如下面是只收集*.j9.com 的堆栈:

#cat debug_tengine_5xx.stp
// On entry to ngx_http_finalize_request with a negative rc, work out the
// request's host name and, if it matches the j9.com pattern, print rc, the
// host and a user-space backtrace.
probe process("/home/admin/tengine/bin/t-coresystem-tengine-cdn").function("ngx_http_finalize_request").call {
    rc = $rc
    if (rc < 0) {
        // Placeholder used when neither source below provides a name.
        host = "(null)"
        if ($r->headers_in->server->len != 0) {
            // First choice: the server string in the request's headers_in.
            // It carries an explicit length, so read at most len bytes.
            host = user_string_n($r->headers_in->server->data, $r->headers_in->server->len)
        } else {
            // Fallback: server_name from the core module's server configuration,
            // selected with the ctx_index of the ngx_http_core_module global.
            // NOTE(review): this indexes srv_conf as an array of
            // ngx_http_core_srv_conf_t structs; nginx itself appears to treat
            // srv_conf as an array of pointers - confirm this resolves to the
            // intended structure.
            cscf = &@cast($r->srv_conf, "ngx_http_core_srv_conf_t")[@var("ngx_http_core_module@src/http/ngx_http_core_module.c")->ctx_index]
            if (cscf->server_name->len != 0) {
                 host = user_string_n(cscf->server_name->data, cscf->server_name->len)
            }
        }

        // Regex filter on the host name.
        // NOTE(review): the dots are escaped as "\." inside a stap string
        // literal; confirm the backslashes reach the regex engine (they may
        // need to be written as "\\.").
        if (host =~ ".*\.j9\.com") {
            printf("rc: %d, host: %s\n", rc, host)
            print_ubacktrace()
        }
    }
}

#stap debug_tengine_5xx.stp
WARNING: Missing unwind data for module, rerun with 'stap -d /lib64/libc-2.12.so'
rc: -4, host: www.j9.com
 0x49af2e : ngx_http_finalize_request+0xe/0x480 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x492eab : ngx_http_core_content_phase+0x2b/0x130 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x48e74d : ngx_http_core_run_phases+0x3d/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x514c3c : ngx_http_lua_socket_tcp_read+0x44c/0x590 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x513150 : ngx_http_lua_socket_tcp_handler+0x30/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x475b96 : ngx_event_process_posted+0x36/0x40 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47d4d8 : ngx_worker_process_cycle+0x138/0x260 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47a38a : ngx_spawn_process+0x1ca/0x5e0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47c73c : ngx_start_worker_processes+0x7c/0x100 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47db5f : ngx_master_process_cycle+0x3af/0x9b0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x45a740 : main+0xa90/0xb50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x3623e1ecdd [/lib64/libc-2.12.so+0x1ecdd/0x38d000]
rc: -4, host: cdn.j9.com
 0x49af2e : ngx_http_finalize_request+0xe/0x480 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x492eab : ngx_http_core_content_phase+0x2b/0x130 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x48e74d : ngx_http_core_run_phases+0x3d/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x514c3c : ngx_http_lua_socket_tcp_read+0x44c/0x590 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x513150 : ngx_http_lua_socket_tcp_handler+0x30/0x50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x475b96 : ngx_event_process_posted+0x36/0x40 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47d4d8 : ngx_worker_process_cycle+0x138/0x260 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47a38a : ngx_spawn_process+0x1ca/0x5e0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47c73c : ngx_start_worker_processes+0x7c/0x100 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x47db5f : ngx_master_process_cycle+0x3af/0x9b0 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x45a740 : main+0xa90/0xb50 [/home/admin/tengine/bin/t-coresystem-tengine-cdn]
 0x3623e1ecdd [/lib64/libc-2.12.so+0x1ecdd/0x38d000]

7.18 关联数组用法

SystemTap 的关联数组必须是全局变量,需要用 global 进行声明,其索引可以支持多达 9 项索引域,各域间以逗号隔开。支持 =, ++ 与 +=操作,其默认的初始值为 0。 例如:

root@j9 ~# cat stap_array.stp 
// Associative array of read counts, keyed by (process name, pid).
global reads
// Count one hit per vfs.read for the process that triggered it.
probe vfs.read {
    reads[execname(), pid()] ++
}
// Every 3 seconds: print all counters, print a separator, then clear the array
// so the next interval starts from zero.
probe timer.s(3) {
    // The two names in brackets receive the two index fields of each entry.
    foreach ([execname, pid] in reads) {
        printf("%s(%d) : %d \n", execname, pid, reads[execname, pid])
    }
    print("============================\n")
    delete reads
}

root@j9 ~# stap stap_array.stp 
stapio(18716) : 16 
rsyslogd(770) : 1 
docker(743) : 3 
IFSWatch(5594) : 30 
QThread(5594) : 6 
AliYunDunUpdate(1057) : 4 
sshd(15118) : 1 
sshd(15191) : 1 
============================
stapio(18716) : 16 
sshd(15191) : 3 
docker(743) : 3 
IFSWatch(5594) : 30 
sshd(15118) : 2 
QThread(5594) : 12 
AliYunDunUpdate(1057) : 8 
============================
^C
root@j9 ~/systemtap#

也可以用+、-进行排序:

root@j9 ~# cat stap_array.stp
// Associative array of read counts, keyed by (process name, pid).
global reads
// Count one hit per vfs.read for the process that triggered it.
probe vfs.read {
    reads[execname(), pid()] ++
}
// Every 3 seconds: print all counters ordered by pid, print a separator, then clear the array.
probe timer.s(3) {
    // The "+" after pid makes foreach visit the entries in ascending pid order.
    foreach ([execname, pid+] in reads) {
        printf("%s(%d) : %d \n", execname, pid, reads[execname, pid])
    }
    print("============================\n")
    delete reads
}

root@j9 ~# stap stap_array.stp 
docker(743) : 3 
rsyslogd(770) : 1 
AliYunDunUpdate(1057) : 12 
IFSWatch(5594) : 30 
QThread(5594) : 12 
sshd(15118) : 2 
sshd(15191) : 2 
stapio(19021) : 16 
============================
docker(743) : 3 
AliYunDunUpdate(1057) : 12 
IFSWatch(5594) : 30 
QThread(5594) : 6 
sshd(15118) : 1 
sshd(15191) : 19 
stapio(19021) : 16 
============================
^C
root@j9 ~#

7.19 调试内存泄漏以及内存重复释放

// Print a banner once when the script starts.
probe begin {
    printf("=============begin============\n")
}

// Associative array, keyed by address, holding the balance of allocations
// (incremented) versus frees (decremented) seen for that address.
global g_mem_ref_tbl
// Associative array, keyed by address, holding the user-space backtrace of
// the allocation or of the free recorded for that address.
global g_mem_bt_tbl

// Record every successful malloc/calloc made by the target process
// (target() is the pid selected with stap -x or -c).
// NOTE(review): only malloc and calloc are probed here; blocks handed out by
// other allocator entry points (e.g. realloc) are not tracked - confirm that
// this is sufficient for the program being debugged.
probe process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_malloc").return, process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_calloc").return {
    // A NULL return is a failed allocation, not a block that can leak, so it
    // must not be entered into the tables.
    if (target() == pid() && $return != 0) {
        // The allocator has just handed this address out, so it is live
        // exactly once no matter what was recorded for it before. Resetting
        // the counter (rather than incrementing only when it is 0) lets an
        // address be tracked again after an unbalanced free drove it negative,
        // and keeps the stored backtrace pointing at the latest allocation.
        g_mem_ref_tbl[$return] = 1
        g_mem_bt_tbl[$return] = sprint_ubacktrace()
    }
}

// Track every free() issued by the target process and report frees that are
// not balanced by a recorded allocation.
probe process("/lib/x86_64-linux-gnu/libc.so.6").function("__libc_free").call {
    // free(NULL) is a no-op: filter it out up front so it neither creates a
    // bogus table entry for address 0 nor needs checking again below.
    if (target() == pid() && $mem != 0) {
        g_mem_ref_tbl[$mem]--

        if (g_mem_ref_tbl[$mem] == 0) {
            // Balanced free: remember where it happened so that a later
            // duplicate free of the same address can show it.
            g_mem_bt_tbl[$mem] = sprint_ubacktrace()
        } else if (g_mem_ref_tbl[$mem] < 0) {
            // More frees than allocations for this address: report the current
            // backtrace together with the backtrace stored for the address.
            printf("MMMMMMMMMMMMMMMMMMMMMMMMMMMM\n")
            printf("g_mem_ref_tbl[%p]: %d\n", $mem, g_mem_ref_tbl[$mem])
            print_ubacktrace()
            printf("last free backtrace:\n%s\n", g_mem_bt_tbl[$mem])
            printf("WWWWWWWWWWWWWWWWWWWWWWWWWWWW\n")
        }
    }
}

// When the script ends, print the stored backtrace of every address whose
// counter is still positive, i.e. allocated but never freed.
probe end {
    // Banner marking the start of the leak report.
    printf("=============end============\n")
    foreach(mem in g_mem_ref_tbl) {
        if (g_mem_ref_tbl[mem] > 0) {
            printf("%s\n", g_mem_bt_tbl[mem])
        }
    }
}

详细请看: http://blog.csdn.net/wangzuxi/article/details/44901285

7.20 嵌入 C 代码

在进程 fork 出子进程时打印出进程 id 和进程名:

root@jusse ~/systemtap# cat copy_process.stp
// Embedded-C helper: formats "pid: <pid>, comm: <comm>" for the task_struct at address task.
function getprocname:string(task:long)
%{
    /* Script-level arguments are reached through the STAP_ARG_ prefix. */
    /* NOTE(review): the pointer is dereferenced directly below; this assumes the
       caller always passes a valid task_struct address - confirm. */
    struct task_struct *task = (struct task_struct *)STAP_ARG_task;
    /* A string result is written into STAP_RETVALUE, bounded by MAXSTRINGLEN. */
    snprintf(STAP_RETVALUE, MAXSTRINGLEN, "pid: %d, comm: %s", task->pid, task->comm);
%}

// Embedded-C helper: returns the pid field of the task_struct at address task.
function getprocid:long(task:long)
%{
    /* NOTE(review): the pointer is dereferenced directly below; this assumes the
       caller always passes a valid task_struct address - confirm. */
    struct task_struct *task = (struct task_struct *)STAP_ARG_task;
    /* A numeric result is handed back with STAP_RETURN. */
    STAP_RETURN(task->pid);
%}

// On return from the kernel function copy_process, print the returned pointer,
// its pid read directly in the script, and the same task described through the
// two embedded-C helpers.
probe kernel.function("copy_process").return
{
    printf("copy_process return: %p, pid: %d, getprocname: %s, getprocid: %d\n", $return, $return->pid, getprocname($return), getprocid($return));
}
root@jusse ~/systemtap# stap -g copy_process.stp
copy_process return: 0xffff880039f61800, pid: 12212, getprocname: pid: 12212, comm: bash, getprocid: 12212
copy_process return: 0xffff880039f61800, pid: 12212, getprocname: pid: 12212, comm: bash, getprocid: 12212
copy_process return: 0xffff880039f63000, pid: 12213, getprocname: pid: 12213, comm: cc_epoll, getprocid: 12213
copy_process return: 0xffff880039f63000, pid: 12213, getprocname: pid: 12213, comm: cc_epoll, getprocid: 12213
copy_process return: 0xffff8800081a9800, pid: 12214, getprocname: pid: 12214, comm: cc_epoll, getprocid: 12214
copy_process return: 0xffff8800081a9800, pid: 12214, getprocname: pid: 12214, comm: cc_epoll, getprocid: 12214
copy_process return: 0xffff8800004d8000, pid: 12215, getprocname: pid: 12215, comm: cc_epoll, getprocid: 12215
copy_process return: 0xffff8800004d8000, pid: 12215, getprocname: pid: 12215, comm: cc_epoll, getprocid: 12215
copy_process return: 0xffff880000564800, pid: 12216, getprocname: pid: 12216, comm: cc_epoll, getprocid: 12216
copy_process return: 0xffff880000564800, pid: 12216, getprocname: pid: 12216, comm: cc_epoll, getprocid: 12216
copy_process return: 0xffff880000566000, pid: 12217, getprocname: pid: 12217, comm: cc_epoll, getprocid: 12217
copy_process return: 0xffff880000566000, pid: 12217, getprocname: pid: 12217, comm: cc_epoll, getprocid: 12217

有三个需要注意的地方:

1)、SystemTap 脚本里面嵌入 C 语言代码要在每个大括号前加 % 前缀,是 %{ …… %} 而不是 %{ …… }%;

2)、获取脚本函数参数要用 STAP_ARG_ 前缀;

3)、一般 long 等返回值用 STAP_RETURN,而 string 类型返回值要用 snprintf、strncat 等方式把字符串复制到 STAP_RETVALUE 里面。

另外,嵌入了 C 代码的脚本需要像上面那样加 -g 参数在 guru 模式下运行。

7.21 调试内核模块

这小节就不细讲了,这篇博客 (http://blog.chinaunix.net/uid-14528823-id-4726046.html) 写得很详细,这里只 copy 两个关键点过来记录一下。要调试自己的内核模块,需要注意两个关键点:

1)、使用 SystemTap 调试内核模块时,探测点的编写格式示例为: module("ext3").function("ext3_*")

2)、需要将自己的模块 cp 到 /lib/modules/`uname -r`/extra 目录中,否则找不到符号;如果 /lib/modules/`uname -r`/ 目录下没有 extra 这个目录,自己 mkdir 一下就可以。

7.22 一些错误提示及解决办法

错误提示 1:

ERROR: MAXACTION exceeded near keyword at debug_connection.stp:86:9
ERROR: MAXACTION exceeded near operator '->' at debug_connection.stp:84:30

解决办法: 加上 stap 参数:-DMAXACTION=102400,如果还报这种类型的错误,只需把 102400 调成更大的值即可。

错误提示 2:

WARNING: Number of errors: 0, skipped probes: 82

解决办法: 加上-DMAXSKIPPED=102400 和-DSTP_NO_OVERLOAD 参数

还有一些可以去掉限制的宏:

MAXSTRINGLEN:这个宏会影响 sprintf 的 buffer 大小,默认为 512 字节。

MAXTRYLOCK:对全局变量进行 try lock 操作的次数,超过该次数还拿不到锁则放弃并跳过该探测点,默认值为 1000。全局变量多的时候可以把这个宏开大一点。

(完)

4306 次点击
所在节点    推广
0 条回复

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/387987

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX