1. Introduction to Pycparser
Project Links: https://github.com/eliben/pycparser
Pycparser is a parser for the C language. It supports the complete C99 standard and is written in pure Python.
It makes it very convenient to parse and process C source code, for example to generate an AST or to extract the function-call relationships in the source code.
Pycparser is very easy to use. You need to focus on the examples directory and the c_ast.py file.
2. Source Code Interpretation
-
Interpretation of c_ast.py File
_c_ast.cfg and c_ast.py provide the description and the implementation of the C99 syntax nodes. For example, _c_ast.cfg describes the If statement as follows:
If: [cond*, iftrue*, iffalse*]
This states that an If node is composed of three child nodes (cond, iftrue and iffalse), which is comparable to a BNF (Backus-Naur form) description.
Definition of IF node in c_ast.py:
class If(Node):
    # AST node for a C `if` statement: a condition plus the two branches.
    # '__weakref__' is listed so instances remain weak-referenceable even
    # though __slots__ removes the per-instance __dict__.
    __slots__ = ('cond', 'iftrue', 'iffalse', 'coord', '__weakref__')
    def __init__(self, cond, iftrue, iffalse, coord=None):
        # cond / iftrue / iffalse are the three child nodes; coord carries the
        # node's location in the source code and defaults to None.
        self.cond = cond
        self.iftrue = iftrue
        self.iffalse = iffalse
        self.coord = coord

    def children(self):
        # Return the child nodes that are present as a tuple of (tag, node)
        # pairs; children that are None (e.g. a missing else branch) are skipped.
        nodelist = []
        if self.cond is not None: nodelist.append(("cond", self.cond))
        if self.iftrue is not None: nodelist.append(("iftrue", self.iftrue))
        if self.iffalse is not None: nodelist.append(("iffalse", self.iffalse))
        return tuple(nodelist)

    def __iter__(self):
        # Yield the same children as children(), in the same order, but
        # without the tag strings.
        if self.cond is not None:
            yield self.cond
        if self.iftrue is not None:
            yield self.iftrue
        if self.iffalse is not None:
            yield self.iffalse

    # This node type has no plain (non-node) attributes.
    attr_names = ()
Note that, besides the three child nodes, the __init__ method takes one more argument, coord, which holds the location of the node in the source code, such as the line number.
Now look at the children method: it collects the child nodes in nodelist, and each child is represented by a tuple. For example, the condition is represented by ("cond", self.cond), where self.cond is the actual condition node and "cond" is the tag of that node.
Following the scripts in the examples directory, we can extract all If nodes from C source code:
from pycparser import c_ast, parse_file


def find_If(node, if_list):
    """Collect the condition of every ``if`` statement at or below *node*.

    The tree is walked depth-first in source order. For each ``c_ast.If``
    node encountered, its condition node (``node.cond``) is appended to
    *if_list*; the list is modified in place and nothing is returned.

    :param node: a pycparser AST node, or None (ignored).
    :param if_list: list that receives the condition nodes.
    """
    if node is None:
        return
    if isinstance(node, c_ast.If):
        if_list.append(node.cond)
    # children() yields (tag, child) pairs; only the child node matters here.
    # Recursing uniformly handles nested and chained (else-if) statements,
    # because each If child records its own condition on entry.
    for _tag, child in node.children():
        find_If(child, if_list)


filename = "notes.c"
ifcondList = []
# use_cpp=True asks pycparser to run the C preprocessor before parsing.
ast = parse_file(filename, use_cpp=True)
find_If(ast, ifcondList)
After extracting the If node, you can do a lot of things, such as output all the conditional statements of the code, as follows:
from pycparser import c_parser, c_ast, parse_file, c_generator

# CGenerator turns AST nodes back into C source text.
generator = c_generator.CGenerator()
for cond_item in ifcondList:
    # find_If stores the condition nodes themselves, so indexing them with [1]
    # would raise TypeError. A (tag, node) pair, as produced by children(),
    # is still accepted: its second element is the real node.
    cond_node = cond_item[1] if isinstance(cond_item, tuple) else cond_item
    cond_code = generator.visit(cond_node)  # C source text of the condition
    print(cond_code)
The condition nodes can also be analyzed further, for example by extracting the constants and operators that appear in the conditions.
3. Implementing the function-call relationship feature of the cflow tool
Pycparser applies the visitor design pattern to process the nodes of interest. The script below is based on the func_calls.py and func_defs.py files in the project's examples directory.
from __future__ import print_function
import sys
import re
import json

# Allow running the script from the pycparser examples directory without
# installing the package.
sys.path.extend(['.', '..'])

from pycparser import c_parser, c_ast, parse_file, c_generator


def extract_funcDef(node, defList):
    """Append every function definition found below *node* to *defList*.

    The search does not descend into a ``FuncDef`` once it has been found.

    :param node: a pycparser AST node, or None (ignored).
    :param defList: list that receives the ``c_ast.FuncDef`` nodes.
    """
    if node is None:
        return
    # children() yields (tag, child) pairs; only the child node is needed.
    for _tag, child in node.children():
        if isinstance(child, c_ast.FuncDef):
            defList.append(child)
        else:
            extract_funcDef(child, defList)


def extract_funcCall(node, funcList):
    """Append every function call found below *node* to *funcList*.

    Calls are collected depth-first in source order. The search also descends
    into each call that is found, so calls nested in the arguments of another
    call (``foo(bar(x))``) are collected as well.

    :param node: a pycparser AST node, a ``(node, tag)`` tuple, or None.
    :param funcList: list that receives the ``c_ast.FuncCall`` nodes.
    """
    # Backward compatibility: a (node, tag) tuple is accepted in place of a
    # bare node; only its first element is used.
    if not isinstance(node, c_ast.Node):
        node = node[0] if node else None
    if node is None:
        return
    for _tag, child in node.children():
        if isinstance(child, c_ast.FuncCall):
            funcList.append(child)
        extract_funcCall(child, funcList)


def _callee_name(call):
    """Return a printable name for the function invoked by a FuncCall node."""
    callee = call.name
    if isinstance(callee, c_ast.ID):
        return callee.name
    # Indirect calls (function pointers, struct members, ...) have no plain
    # identifier, so render the callee expression as C source text instead.
    return c_generator.CGenerator().visit(callee)


class FuncDefVisitor(c_ast.NodeVisitor):
    """Collect the calls made inside the function named *funcname*."""

    def __init__(self, funcname, funcList):
        self.funcname = funcname
        self.funcList = funcList

    def visit_FuncDef(self, node):
        # Called for each FuncDef node during the visit; only the body of the
        # function of interest is scanned for calls.
        if node.decl.name == self.funcname:
            extract_funcCall(node, self.funcList)


def show_deflist(defList, verbose=False):
    """Optionally print the name and location of each function definition.

    With the default ``verbose=False`` nothing is printed, matching the
    original behaviour in which the print statement was commented out.

    :param defList: list of ``c_ast.FuncDef`` nodes.
    :param verbose: when true, print each function name and its coordinate.
    """
    for defFunc in defList:
        name = defFunc.decl.name
        if verbose:
            print(name, defFunc.decl.coord)


def show_func_defs(ast, funcname, the_dict, invoke_dict):
    """Record the call relations of the function *funcname*.

    :param ast: the parsed translation unit.
    :param funcname: name of the function whose body is analysed.
    :param the_dict: maps a callee name to the list of functions that call
        it; updated in place.
    :param invoke_dict: maps a function name to the list of callee names it
        invokes; updated in place.
    """
    funcList = []
    FuncDefVisitor(funcname, funcList).visit(ast)
    callees = [_callee_name(call) for call in funcList]
    invoke_dict[funcname] = callees
    for callee in callees:
        # setdefault creates the caller list on first use instead of relying
        # on a broad exception handler.
        the_dict.setdefault(callee, []).append(funcname)


if __name__ == '__main__':
    filename = "./codes/notes.c"
    defList = []
    the_dict = {}
    invoke_dict = {}
    ast = parse_file(filename, use_cpp=True)
    extract_funcDef(ast, defList)
    show_deflist(defList)
    nameList = [item.decl.name for item in defList]
    for name in nameList:
        show_func_defs(ast, name, the_dict, invoke_dict)
    print('====Ref_dict====')
    for k, v in the_dict.items():
        print('{}:{}'.format(k, v))
    print('====Invoke_dict====')
    for k, v in invoke_dict.items():
        print('{}:{}'.format(k, v))
The output results are as follows:
In the next article, I'll show you how to use Invoke_dict to generate a call graph. We need to use graphviz. This part has been implemented and you can see the result.