diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 9d806318..36edd03e 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -19,6 +19,7 @@ jobs: ['id'] ['Path'] ['ToolVersion'] + ['ASTVersion'] ['Modules']['a.b/c']['Dependencies']['a.b/c'] ['Modules']['a.b/c/cmdx']['Dependencies']['a.b/c/cmdx'] steps: diff --git a/docs/uniast-en.md b/docs/uniast-en.md index 81fb345e..ef34e742 100644 --- a/docs/uniast-en.md +++ b/docs/uniast-en.md @@ -1,4 +1,4 @@ -# Universal Abstract-Syntax-Tree Specification (v0.1.3) +# Universal Abstract-Syntax-Tree Specification (v0.2.0) Universal Abstract-Syntax-Tree is a LLM-friendly, language-agnostic code context data structure established by ABCoder. It represents a unified abstract syntax tree of a repository's code, collecting definitions of language entities (functions, types, constants/variables) and their interdependencies for subsequent AI understanding and coding-workflow development. @@ -370,6 +370,23 @@ Function type AST Node entity, corresponding to [NodeType] as FUNC, including fu - Vars: Global variables referenced within the current function, including variables and constants +- Extra: Additional information for storing language-specific details or extra metadata + + + - AnonymousFunctions: Anonymous functions defined in the function, each element is the FileLine of the corresponding function + + + - File: The filename where it is located + + + - Line: **Line number of the starting position in the file (starting from 1)** + + + - StartOffset: **Byte offset of the code starting position relative to the file header** + + + - EndOffset: **Byte offset of the code ending position relative to the file header** + ###### Dependency @@ -384,7 +401,10 @@ Represents a dependency relationship, containing the dependent node Id, dependen "File": "manager.go", "Line": 140, "StartOffset": 3547, - "EndOffset": 3564 + "EndOffset": 3564, + "Extra": { + "FunctionIsCall": true + 
} } ``` @@ -409,6 +429,12 @@ Represents a dependency relationship, containing the dependent node Id, dependen - EndOffset: Offset of the ending position of the dependency point (not the dependent node) token relative to the code file +- Extra: Additional information for storing language-specific details or extra metadata + + + - FunctionIsCall: If the Dependency is a function call, whether it actually executes the function call or just references the function + + ##### Type Type definition, [NodeType] is TYPE, including type definitions in specific languages such as structs, enums, interfaces, type aliases, etc. @@ -490,6 +516,9 @@ Type definition, [NodeType] is TYPE, including type definitions in specific lang - Implements: Which interfaces this type implements Identity +- Extra: Additional information for storing language-specific details or extra metadata + + ##### Var Global variables, including variables and constants, **but must be global** @@ -553,6 +582,24 @@ var x = getx(y db.Data) int { - Groups: Group definitions, such as `const( A=1, B=2, C=3)` in Go, Groups would be `[C=3, B=2]` (assuming A is the variable itself) +- Extra: Additional information for storing language-specific details or extra metadata + + + - AnonymousFunctions: Anonymous functions defined in the initialization function of the current variable. Each element is the FileLine of the corresponding function + + + - File: The filename where it is located + + + - Line: **Line number of the starting position in the file (starting from 1)** + + + - StartOffset: **Byte offset of the code starting position relative to the file header** + + + - EndOffset: **Byte offset of the code ending position relative to the file header** + + ### Graph The dependency topology graph of all AST Nodes in the repository. Formatted as Identity => Node mapping, where each Node contains dependency relationships with other nodes. 
diff --git a/docs/uniast-zh.md b/docs/uniast-zh.md index 0cca4928..8fb05aea 100644 --- a/docs/uniast-zh.md +++ b/docs/uniast-zh.md @@ -1,4 +1,4 @@ -# Universal Abstract-Syntax-Tree Specification (v0.1.3) +# Universal Abstract-Syntax-Tree Specification (v0.2.0) Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言无关的代码上下文数据结构,表示某个仓库代码的统一抽象语法树。收集了语言实体(函数、类型、常(变)量)的定义及其相互依赖关系,用于后续的 AI 理解、coding-workflow 开发。 @@ -371,12 +371,29 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言 - Vars: 当前函数内引用的全局量,包括变量和常量 +- Extra: 额外信息,用于存储一些语言特定的信息,或者是一些额外的元数据 + + + - AnonymousFunctions: 函数中所定义的匿名函数,每个元素为对应函数的 FileLine + + + - File: 所在的文件名 + + + - Line: **起始位置文件的行号(从1开始)** + + + - StartOffset: 代码起始位置**相对文件头的字节偏移量** + + + - EndOffset: 代码结束位置**相对文件头的字节偏移量** + ###### Dependency 表示一个依赖关系,包含依赖节点 Id、依赖产生位置等信息,方便 LLM 准确识别 -``` +```json { "ModPath": "github.com/cloudwego/localsession", "PkgPath": "github.com/cloudwego/localsession", @@ -384,7 +401,10 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言 "File": "manager.go", "Line": 140, "StartOffset": 3547, - "EndOffset": 3564 + "EndOffset": 3564, + "Extra": { + "FunctionIsCall": true + } } ``` @@ -409,6 +429,12 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言 - EndOffset: 依赖点(不是被依赖节点)token 结束位置相对代码文件的偏移 +- Extra: 额外信息,用于存储一些语言特定的信息,或者是一些额外的元数据 + + + - FunctionIsCall: 如果 Dependency 是一个函数调用,是否真正执行了函数调用,而不是只是引用了函数 + + ##### Type 类型定义,【NodeType】为 TYPE,包括具体语言中的类型定义,如 结构体、枚举、接口、类型别名等 @@ -490,6 +516,9 @@ Universal Abstract-Syntax-Tree 是 ABCoder 建立的一种 LLM 亲和、语言 - Implements: 该类型实现了哪些接口 **Identity** +- Extra: 额外信息,用于存储一些语言特定的信息,或者是一些额外的元数据 + + ##### Var 全局量,包括变量和常量,**但是必须是全局** @@ -553,6 +582,24 @@ var x = getx(y db.Data) int { - Groups: 同组定义, 如 Go 中的 `const( A=1, B=2, C=3)`,Groups 为 `[C=3, B=2]`(假设 A 为变量自身) +- Extra: 额外信息,用于存储一些语言特定的信息,或者是一些额外的元数据 + + + - AnonymousFunctions: 在当前变量的初始化函数中,所定义的匿名函数。每个元素为对应函数的 FileLine + + + - File: 所在的文件名 + + + - Line: **起始位置文件的行号(从1开始)** + + + - StartOffset: 代码起始位置**相对文件头的字节偏移量** + + + - 
EndOffset: 代码结束位置**相对文件头的字节偏移量** + + ### Graph 整个仓库的 AST Node 依赖拓扑图。形式为 Identity => Node 的映射,其中每个 Node 包含对其它节点的依赖关系。基于该拓扑图,可以实现**任意节点上下文的递归获取**。 diff --git a/go.mod b/go.mod index 60a83f02..3687cda2 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.23.4 require ( github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible + github.com/bytedance/sonic v1.14.1 github.com/cloudwego/eino v0.3.52 github.com/cloudwego/eino-ext/components/model/ark v0.1.16 github.com/cloudwego/eino-ext/components/model/claude v0.1.1 @@ -43,7 +44,6 @@ require ( github.com/bahlo/generic-list-go v0.2.0 // indirect github.com/buger/jsonparser v1.1.1 // indirect github.com/bytedance/gopkg v0.1.3 // indirect - github.com/bytedance/sonic v1.14.1 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cloudwego/base64x v0.1.6 // indirect diff --git a/lang/golang/parser/file.go b/lang/golang/parser/file.go index a2e8bac6..95d503f4 100644 --- a/lang/golang/parser/file.go +++ b/lang/golang/parser/file.go @@ -26,6 +26,11 @@ import ( . 
// Keys used by the Go parser when attaching language-specific metadata to
// the Extra field of uniast nodes and dependencies.
const (
	// ExtraKey_FunctionIsCall is stored with the value true on a Dependency
	// whose reference position was recorded as the callee of a call
	// expression, i.e. the dependency is invoked rather than only referenced.
	ExtraKey_FunctionIsCall = "FunctionIsCall"
	// ExtraKey_AnonymousFunctions is stored on a Function or Var and holds the
	// []FileLine of the function literals found while walking the function
	// body or the variable's initializer expression.
	ExtraKey_AnonymousFunctions = "AnonymousFunctions"
)
ast.Node, collect *collec return true } +// parseCall collect direct call info +func (p *GoParser) parseCall(ctx *fileContext, expr *ast.CallExpr, collect *collectInfos) { + var ident *ast.Ident + + switch idt := expr.Fun.(type) { + case *ast.Ident: + ident = idt + case *ast.SelectorExpr: + ident = idt.Sel + } + + if ident != nil { + collect.directCalls[ctx.FileLine(ident)] = true + } +} + // parseFunc parses all function declaration in one file func (p *GoParser) parseFunc(ctx *fileContext, funcDecl *ast.FuncDecl) (*Function, bool) { // method receiver @@ -511,7 +551,9 @@ func (p *GoParser) parseFunc(ctx *fileContext, funcDecl *ast.FuncDecl) (*Functio // collect content content := string(ctx.GetRawContent(funcDecl)) - collects := collectInfos{} + collects := collectInfos{ + directCalls: map[FileLine]bool{}, + } if funcDecl.Body == nil { goto set_func } @@ -521,7 +563,6 @@ func (p *GoParser) parseFunc(ctx *fileContext, funcDecl *ast.FuncDecl) (*Functio }) set_func: - if fname == "init" && p.repo.GetFunction(NewIdentity(ctx.module.Name, ctx.pkgPath, fname)) != nil { // according to https://go.dev/ref/spec#Program_initialization_and_execution, // duplicated init() is allowed and never be referenced, thus add a subfix @@ -544,6 +585,22 @@ set_func: f.Types = InsertDependency(f.Types, t) } f.Signature = string(sig) + + if len(collects.directCalls) > 0 { + for i, dep := range f.FunctionCalls { + if collects.directCalls[dep.FileLine] { + f.FunctionCalls[i].SetExtra(ExtraKey_FunctionIsCall, true) + } + } + for i, dep := range f.MethodCalls { + if collects.directCalls[dep.FileLine] { + f.MethodCalls[i].SetExtra(ExtraKey_FunctionIsCall, true) + } + } + } + if len(collects.anonymousFunctions) > 0 { + f.SetExtra(ExtraKey_AnonymousFunctions, collects.anonymousFunctions) + } return f, false } diff --git a/lang/uniast/ast.go b/lang/uniast/ast.go index 873364bb..ce30036c 100644 --- a/lang/uniast/ast.go +++ b/lang/uniast/ast.go @@ -23,6 +23,7 @@ import ( "strconv" "strings" + 
"github.com/bytedance/sonic" "golang.org/x/tools/go/packages" ) @@ -510,11 +511,14 @@ type Function struct { // func llm compress result CompressData *string `json:"compress_data,omitempty"` + + Extra *ExtraInfo `json:",omitempty"` } type Dependency struct { Identity FileLine `json:",omitempty"` + Extra *ExtraInfo `json:",omitempty"` } func (d Dependency) Id() Identity { @@ -607,6 +611,9 @@ type Type struct { // FieldFunctions map[string]string CompressData *string `json:"compress_data,omitempty"` // struct llm compress result + + // extra data + Extra *ExtraInfo `json:",omitempty"` } type Var struct { @@ -623,4 +630,95 @@ type Var struct { Groups []Identity `json:",omitempty"` CompressData *string `json:"compress_data,omitempty"` + + // extra data + Extra *ExtraInfo `json:",omitempty"` +} + +type ExtraInfo struct { + data map[string]any +} + +func (e *ExtraInfo) MarshalJSON() ([]byte, error) { + return sonic.Marshal(e.data) +} + +func (e *ExtraInfo) UnmarshalJSON(data []byte) error { + return sonic.Unmarshal(data, &e.data) +} + +func (t *Type) GetExtra(key string) any { + if t.Extra == nil { + return nil + } + if v, ok := t.Extra.data[key]; ok { + return v + } + return nil +} + +func (e *Type) SetExtra(key string, value any) { + if e.Extra == nil { + e.Extra = &ExtraInfo{ + data: make(map[string]any), + } + } + e.Extra.data[key] = value +} + +func (v *Var) GetExtra(key string) any { + if v.Extra == nil { + return nil + } + if v, ok := v.Extra.data[key]; ok { + return v + } + return nil +} + +func (v *Var) SetExtra(key string, value any) { + if v.Extra == nil { + v.Extra = &ExtraInfo{ + data: make(map[string]any), + } + } + v.Extra.data[key] = value +} + +func (f *Function) GetExtra(key string) any { + if f.Extra == nil { + return nil + } + if v, ok := f.Extra.data[key]; ok { + return v + } + return nil +} + +func (f *Function) SetExtra(key string, value any) { + if f.Extra == nil { + f.Extra = &ExtraInfo{ + data: make(map[string]any), + } + } + f.Extra.data[key] = 
value +} + +func (d *Dependency) GetExtra(key string) any { + if d.Extra == nil { + return nil + } + if v, ok := d.Extra.data[key]; ok { + return v + } + return nil +} + +func (d *Dependency) SetExtra(key string, value any) { + if d.Extra == nil { + d.Extra = &ExtraInfo{ + data: make(map[string]any), + } + } + d.Extra.data[key] = value } diff --git a/lang/uniast/version.go b/lang/uniast/version.go index 99a9f558..87a8373c 100644 --- a/lang/uniast/version.go +++ b/lang/uniast/version.go @@ -16,4 +16,4 @@ package uniast -const Version = "v0.1.4" +const Version = "v0.2.0"