diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/analyze/analyze.go | 208 | ||||
| -rw-r--r-- | internal/analyze/analyze_test.go | 174 | ||||
| -rw-r--r-- | internal/analyze/resources.go | 26 | ||||
| -rw-r--r-- | internal/collect/collect.go | 331 | ||||
| -rw-r--r-- | internal/nlp/dictionary.go | 614 | ||||
| -rw-r--r-- | internal/nlp/dictionary_test.go | 165 | ||||
| -rw-r--r-- | internal/report/report.go | 80 |
7 files changed, 1598 insertions, 0 deletions
diff --git a/internal/analyze/analyze.go b/internal/analyze/analyze.go new file mode 100644 index 0000000..315f086 --- /dev/null +++ b/internal/analyze/analyze.go @@ -0,0 +1,208 @@ +package analyze + +import ( + "github.com/Fuwn/kivia/internal/collect" + "strings" + "unicode" + "unicode/utf8" +) + +type Options struct { + MinEvaluationLength int +} + +type Result struct { + Violations []Violation `json:"violations"` +} + +type Violation struct { + Identifier collect.Identifier `json:"identifier"` + Reason string `json:"reason"` +} + +func Run(identifiers []collect.Identifier, options Options) (Result, error) { + minimumEvaluationLength := options.MinEvaluationLength + + if minimumEvaluationLength <= 0 { + minimumEvaluationLength = 1 + } + + resources, err := getResources() + + if err != nil { + return Result{}, err + } + + violations := make([]Violation, 0) + + for _, identifier := range identifiers { + if utf8.RuneCountInString(strings.TrimSpace(identifier.Name)) < minimumEvaluationLength { + continue + } + + evaluation := evaluateIdentifier(identifier, resources, minimumEvaluationLength) + + if !evaluation.isViolation { + continue + } + + violation := Violation{ + Identifier: identifier, + Reason: evaluation.reason, + } + violations = append(violations, violation) + } + + return Result{Violations: violations}, nil +} + +type evaluationResult struct { + isViolation bool + reason string +} + +func evaluateIdentifier(identifier collect.Identifier, resources resources, minimumTokenLength int) evaluationResult { + name := strings.TrimSpace(identifier.Name) + + if name == "" { + return evaluationResult{} + } + + tokens := tokenize(name) + + if len(tokens) == 0 { + return evaluationResult{} + } + + for _, token := range tokens { + if utf8.RuneCountInString(token) < minimumTokenLength { + continue + } + + if !isAlphabeticToken(token) { + continue + } + + if resources.dictionary.IsWord(token) { + continue + } + + if isUpperCaseToken(name, token) { + continue + } + + if isDisallowedAbbreviation(token, resources) { + return evaluationResult{isViolation: true, reason: "Contains abbreviation: " + token + "."} + } + + return evaluationResult{isViolation: true, reason: "Term not found in dictionary: " + token + "."} + } + + return evaluationResult{} +} + +func isUpperCaseToken(identifierName string, token string) bool { + tokenLength := utf8.RuneCountInString(token) + + if tokenLength < 2 || tokenLength > 8 { + return false + } + + return strings.Contains(identifierName, strings.ToUpper(token)) +} + +func tokenize(name string) []string { + name = strings.TrimSpace(name) + + if name == "" { + return nil + } + + parts := strings.FieldsFunc(name, func(r rune) bool { + return r == '_' || r == '-' || r == ' ' + }) + + if len(parts) == 0 { + return nil + } + + result := make([]string, 0, len(parts)*2) + + for _, part := range parts { + if part == "" { + continue + } + + result = append(result, splitCamel(part)...) + } + + return result +} + +func splitCamel(input string) []string { + if input == "" { + return nil + } + + runes := []rune(input) + + if len(runes) == 0 { + return nil + } + + tokens := make([]string, 0, 2) + start := 0 + + for index := 1; index < len(runes); index++ { + current := runes[index] + previous := runes[index-1] + next := rune(0) + + if index+1 < len(runes) { + next = runes[index+1] + } + + isBoundary := false + + if unicode.IsLower(previous) && unicode.IsUpper(current) { + isBoundary = true + } + + if unicode.IsDigit(previous) != unicode.IsDigit(current) { + isBoundary = true + } + + if unicode.IsUpper(previous) && unicode.IsUpper(current) && next != 0 && unicode.IsLower(next) { + isBoundary = true + } + + if isBoundary { + tokens = append(tokens, strings.ToLower(string(runes[start:index]))) + start = index + } + } + + tokens = append(tokens, strings.ToLower(string(runes[start:]))) + + return tokens +} + +func isDisallowedAbbreviation(token string, resources resources) bool { + _, hasExpansion := resources.dictionary.AbbreviationExpansion(token) + + return hasExpansion +} + +func isAlphabeticToken(token string) bool { + if token == "" { + return false + } + + for _, character := range token { + if !unicode.IsLetter(character) { + return false + } + } + + return true +} diff --git a/internal/analyze/analyze_test.go b/internal/analyze/analyze_test.go new file mode 100644 index 0000000..8aebf8d --- /dev/null +++ b/internal/analyze/analyze_test.go @@ -0,0 +1,174 @@ +package analyze_test + +import ( + "github.com/Fuwn/kivia/internal/analyze" + "github.com/Fuwn/kivia/internal/collect" + "os" + "path/filepath" + "testing" +) + +func dictionaryPathForTests(testingContext *testing.T) string { + testingContext.Helper() + + return filepath.Join("..", "..", "testdata", "dictionary", "words.txt") +} + +func TestAnalyzeFlagsAbbreviations(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + root := filepath.Join("..", "..", "testdata", "samplepkg") + identifiers, err := collect.FromPath(root) + + if err != nil { + testingContext.Fatalf("collect.FromPath returned an error: %v", err) + } + + result, err := analyze.Run(identifiers, analyze.Options{}) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) == 0 { + testingContext.Fatalf("Expected at least one violation, got none.") + } + + mustContainViolation(testingContext, result, "ctx") + mustContainViolation(testingContext, result, "userNum") + mustContainViolation(testingContext, result, "usr") +} + +func TestAnalyzeFlagsTechnicalTermsNotInDictionary(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + identifiers := []collect.Identifier{ + {Name: "userID", Kind: "variable"}, + {Name: "httpClient", Kind: "variable"}, + } + result, err := analyze.Run(identifiers, analyze.Options{}) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) == 0 { + testingContext.Fatalf("Expected violations, got none.") + } + + mustContainViolation(testingContext, result, "userID") + mustContainViolation(testingContext, result, "httpClient") +} + +func TestAnalyzeDoesNotFlagNormalDictionaryWords(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + identifiers := []collect.Identifier{ + {Name: "options", Kind: "variable"}, + {Name: "parsedResource", Kind: "variable"}, + {Name: "hasResources", Kind: "variable"}, + {Name: "allowlist", Kind: "variable"}, + } + result, err := analyze.Run(identifiers, analyze.Options{}) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) != 0 { + testingContext.Fatalf("Expected no violations, got %d.", len(result.Violations)) + } +} + +func TestAnalyzeMinEvaluationLengthSkipsSingleLetterIdentifiers(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + identifiers := []collect.Identifier{ + {Name: "t", Kind: "parameter"}, + {Name: "v", Kind: "receiver"}, + {Name: "ctx", Kind: "parameter"}, + } + result, err := analyze.Run(identifiers, analyze.Options{ + MinEvaluationLength: 2, + }) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) != 1 { + testingContext.Fatalf("Expected one violation, got %d.", len(result.Violations)) + } + + if result.Violations[0].Identifier.Name != "ctx" { + testingContext.Fatalf("Expected only ctx to be evaluated, got %q.", result.Violations[0].Identifier.Name) + } +} + +func TestAnalyzeFlagsExpressionAbbreviation(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + identifiers := []collect.Identifier{ + {Name: "expr", Kind: "variable"}, + } + result, err := analyze.Run(identifiers, analyze.Options{ + MinEvaluationLength: 1, + }) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) != 1 { + testingContext.Fatalf("Expected one violation, got %d.", len(result.Violations)) + } + + if result.Violations[0].Identifier.Name != "expr" { + testingContext.Fatalf("Expected expr to be flagged, got %q.", result.Violations[0].Identifier.Name) + } +} + +func TestAnalyzeAllowsUpperCaseTokens(testingContext *testing.T) { + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPathForTests(testingContext)) + + identifiers := []collect.Identifier{ + {Name: "JSON", Kind: "variable"}, + } + result, err := analyze.Run(identifiers, analyze.Options{}) + + if err != nil { + testingContext.Fatalf("analyze.Run returned an error: %v", err) + } + + if len(result.Violations) != 0 { + testingContext.Fatalf("Expected no violations, got %d.", len(result.Violations)) + } +} + +func TestAnalyzeFailsWhenDictionaryIsUnavailable(testingContext *testing.T) { + emptyDictionaryPath := filepath.Join(testingContext.TempDir(), "empty.txt") + + if err := os.WriteFile(emptyDictionaryPath, []byte("\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", emptyDictionaryPath) + + _, err := analyze.Run([]collect.Identifier{{Name: "ctx", Kind: "parameter"}}, analyze.Options{}) + + if err == nil { + testingContext.Fatalf("Expected analyze.Run to fail when dictionary data is unavailable.") + } +} + +func mustContainViolation(testingContext *testing.T, result analyze.Result, name string) { + testingContext.Helper() + + for _, violation := range result.Violations { + if violation.Identifier.Name == name { + return + } + } + + testingContext.Fatalf("Expected a violation for %q.", name) +} diff --git a/internal/analyze/resources.go b/internal/analyze/resources.go new file mode 100644 index 0000000..f42c757 --- /dev/null +++ b/internal/analyze/resources.go @@ -0,0 +1,26 @@ +package analyze + +import ( + "fmt" + "github.com/Fuwn/kivia/internal/nlp" +) + +type resources struct { + dictionary *nlp.Dictionary +} + +func getResources() (resources, error) { + return loadResources() +} + +func loadResources() (resources, error) { + dictionary, err := nlp.NewDictionary() + + if err != nil { + return resources{}, fmt.Errorf("Failed to load dictionary: %w", err) + } + + return resources{ + dictionary: dictionary, + }, nil +} diff --git a/internal/collect/collect.go b/internal/collect/collect.go new file mode 100644 index 0000000..ccb3b46 --- /dev/null +++ b/internal/collect/collect.go @@ -0,0 +1,331 @@ +package collect + +import ( + "bytes" + "fmt" + "go/ast" + "go/parser" + "go/printer" + "go/token" + "io/fs" + "os" + "path/filepath" + "sort" + "strings" +) + +type Context struct { + EnclosingFunction string `json:"enclosingFunction,omitempty"` + Type string `json:"type,omitempty"` + ValueExpression string `json:"valueExpression,omitempty"` + ParentType string `json:"parentType,omitempty"` +} + +type Identifier struct { + Name string `json:"name"` + Kind string `json:"kind"` + File string `json:"file"` + Line int `json:"line"` + Column int `json:"column"` + Context Context `json:"context"` +} + +func FromPath(path string) ([]Identifier, error) { + files, err := discoverFiles(path) + + if err != nil { + return nil, err + } + + fileSet := token.NewFileSet() + identifiers := make([]Identifier, 0, 128) + + for _, filePath := range files { + fileNode, parseErr := parser.ParseFile(fileSet, filePath, nil, parser.SkipObjectResolution) + + if parseErr != nil { + return nil, fmt.Errorf("Failed to parse %s: %w", filePath, parseErr) + } + + collector := visitor{ + fileSet: fileSet, + file: filePath, + } + + ast.Walk(&collector, fileNode) + + identifiers = append(identifiers, collector.identifiers...) + } + + return identifiers, nil +} + +type visitor struct { + fileSet *token.FileSet + file string + identifiers []Identifier + functionStack []string + typeStack []string +} + +func (identifierVisitor *visitor) Visit(node ast.Node) ast.Visitor { + switch typedNode := node.(type) { + case *ast.FuncDecl: + identifierVisitor.addIdentifier(typedNode.Name, "function", Context{}) + + identifierVisitor.functionStack = append(identifierVisitor.functionStack, typedNode.Name.Name) + + identifierVisitor.captureFieldList(typedNode.Recv, "receiver") + identifierVisitor.captureFieldList(typedNode.Type.Params, "parameter") + identifierVisitor.captureFieldList(typedNode.Type.Results, "result") + + return leaveScope(identifierVisitor, func() { + identifierVisitor.functionStack = identifierVisitor.functionStack[:len(identifierVisitor.functionStack)-1] + }) + case *ast.TypeSpec: + identifierVisitor.addIdentifier(typedNode.Name, "type", Context{}) + + identifierVisitor.typeStack = append(identifierVisitor.typeStack, typedNode.Name.Name) + + identifierVisitor.captureTypeMembers(typedNode.Name.Name, typedNode.Type) + + return leaveScope(identifierVisitor, func() { identifierVisitor.typeStack = identifierVisitor.typeStack[:len(identifierVisitor.typeStack)-1] }) + case *ast.ValueSpec: + declaredType := renderExpression(identifierVisitor.fileSet, typedNode.Type) + rightHandValue := renderExpressionList(identifierVisitor.fileSet, typedNode.Values) + + for _, name := range typedNode.Names { + identifierVisitor.addIdentifier(name, "variable", Context{Type: declaredType, ValueExpression: rightHandValue}) + } + case *ast.AssignStmt: + if typedNode.Tok != token.DEFINE { + break + } + + rightHandValue := renderExpressionList(identifierVisitor.fileSet, typedNode.Rhs) + + for index, left := range typedNode.Lhs { + identifierNode, ok := left.(*ast.Ident) + + if !ok { + continue + } + + assignmentContext := Context{ValueExpression: rightHandValue} + + if index < len(typedNode.Rhs) { + assignmentContext.Type = inferTypeFromExpression(typedNode.Rhs[index]) + } + + identifierVisitor.addIdentifier(identifierNode, "variable", assignmentContext) + } + case *ast.RangeStmt: + if typedNode.Tok != token.DEFINE { + break + } + + if keyIdentifier, ok := typedNode.Key.(*ast.Ident); ok { + identifierVisitor.addIdentifier(keyIdentifier, "rangeKey", Context{ValueExpression: renderExpression(identifierVisitor.fileSet, typedNode.X)}) + } + + if valueIdentifier, ok := typedNode.Value.(*ast.Ident); ok { + identifierVisitor.addIdentifier(valueIdentifier, "rangeValue", Context{ValueExpression: renderExpression(identifierVisitor.fileSet, typedNode.X)}) + } + } + + return identifierVisitor +} + +type scopeExit struct { + parent *visitor + onLeave func() +} + +func leaveScope(parent *visitor, onLeave func()) ast.Visitor { + return &scopeExit{parent: parent, onLeave: onLeave} +} + +func (scopeExitVisitor *scopeExit) Visit(node ast.Node) ast.Visitor { + if node == nil { + scopeExitVisitor.onLeave() + + return nil + } + + return scopeExitVisitor.parent +} + +func (identifierVisitor *visitor) captureFieldList(fields *ast.FieldList, kind string) { + if fields == nil { + return + } + + for _, field := range fields.List { + declaredType := renderExpression(identifierVisitor.fileSet, field.Type) + + for _, name := range field.Names { + identifierVisitor.addIdentifier(name, kind, Context{Type: declaredType}) + } + } +} + +func (identifierVisitor *visitor) captureTypeMembers(typeName string, typeExpression ast.Expr) { + switch typedType := typeExpression.(type) { + case *ast.StructType: + if typedType.Fields == nil { + return + } + + for _, field := range typedType.Fields.List { + memberType := renderExpression(identifierVisitor.fileSet, field.Type) + + for _, fieldName := range field.Names { + identifierVisitor.addIdentifier(fieldName, "field", Context{Type: memberType, ParentType: typeName}) + } + } + case *ast.InterfaceType: + if typedType.Methods == nil { + return + } + + for _, method := range typedType.Methods.List { + memberType := renderExpression(identifierVisitor.fileSet, method.Type) + + for _, methodName := range method.Names { + identifierVisitor.addIdentifier(methodName, "interfaceMethod", Context{Type: memberType, ParentType: typeName}) + } + } + } +} + +func (identifierVisitor *visitor) addIdentifier(identifier *ast.Ident, kind string, context Context) { + if identifier == nil || identifier.Name == "_" { + return + } + + position := identifierVisitor.fileSet.Position(identifier.NamePos) + context.EnclosingFunction = currentFunction(identifierVisitor.functionStack) + identifierVisitor.identifiers = append(identifierVisitor.identifiers, Identifier{ + Name: identifier.Name, + Kind: kind, + File: identifierVisitor.file, + Line: position.Line, + Column: position.Column, + Context: context, + }) +} + +func currentFunction(stack []string) string { + if len(stack) == 0 { + return "" + } + + return stack[len(stack)-1] +} + +func discoverFiles(path string) ([]string, error) { + searchRoot := path + recursive := false + + if strings.HasSuffix(path, "/...") { + searchRoot = strings.TrimSuffix(path, "/...") + recursive = true + } + + if searchRoot == "" { + searchRoot = "." + } + + pathFileDetails, err := os.Stat(searchRoot) + + if err != nil { + return nil, err + } + + if !pathFileDetails.IsDir() { + if strings.HasSuffix(searchRoot, ".go") { + return []string{searchRoot}, nil + } + + return nil, fmt.Errorf("Path %q is not a Go file.", searchRoot) + } + + files := make([]string, 0, 64) + walkErr := filepath.WalkDir(searchRoot, func(candidate string, entry fs.DirEntry, walkError error) error { + if walkError != nil { + return walkError + } + + if entry.IsDir() { + name := entry.Name() + + if name == ".git" || name == "vendor" || name == "node_modules" { + return filepath.SkipDir + } + + if !recursive && candidate != searchRoot { + return filepath.SkipDir + } + + return nil + } + + if strings.HasSuffix(candidate, ".go") { + files = append(files, candidate) + } + + return nil + }) + + if walkErr != nil { + return nil, walkErr + } + + sort.Strings(files) + + return files, nil +} + +func renderExpression(fileSet *token.FileSet, expression ast.Expr) string { + if expression == nil { + return "" + } + + var buffer bytes.Buffer + + if err := printer.Fprint(&buffer, fileSet, expression); err != nil { + return "" + } + + return buffer.String() +} + +func renderExpressionList(fileSet *token.FileSet, expressions []ast.Expr) string { + if len(expressions) == 0 { + return "" + } + + parts := make([]string, 0, len(expressions)) + + for _, expression := range expressions { + parts = append(parts, renderExpression(fileSet, expression)) + } + + return strings.Join(parts, ", ") +} + +func inferTypeFromExpression(expression ast.Expr) string { + switch typedExpression := expression.(type) { + case *ast.CallExpr: + switch functionExpression := typedExpression.Fun.(type) { + case *ast.Ident: + return functionExpression.Name + case *ast.SelectorExpr: + return functionExpression.Sel.Name + } + + return "" + default: + return "" + } +} diff --git a/internal/nlp/dictionary.go b/internal/nlp/dictionary.go new file mode 100644 index 0000000..e7db37e --- /dev/null +++ b/internal/nlp/dictionary.go @@ -0,0 +1,614 @@ +package nlp + +import ( + "bufio" + "errors" + "fmt" + "github.com/sajari/fuzzy" + "os" + "path" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" + "unicode/utf8" +) + +var wordPattern = regexp.MustCompile(`[A-Za-z]+`) + +type Dictionary struct { + model *fuzzy.Model + words map[string]struct{} + wordsByFirstCharacter map[rune][]string +} + +func NewDictionary() (*Dictionary, error) { + words, err := loadWords() + + if err != nil { + return nil, err + } + + wordSet := makeWordSet(words) + wordsByFirstCharacter := makeWordsByFirstCharacter(words) + model, loadErr := loadCachedModel() + + if loadErr == nil { + return &Dictionary{model: model, words: wordSet, wordsByFirstCharacter: wordsByFirstCharacter}, nil + } + + model = fuzzy.NewModel() + + model.SetThreshold(1) + model.SetDepth(1) + model.SetUseAutocomplete(false) + model.Train(words) + + _ = saveCachedModel(model) + + return &Dictionary{model: model, words: wordSet, wordsByFirstCharacter: wordsByFirstCharacter}, nil +} + +func (dictionary *Dictionary) IsWord(token string) bool { + token = normalizeToken(token) + + if token == "" || dictionary == nil { + return false + } + + return dictionary.isLexiconWord(token) +} + +func (dictionary *Dictionary) Suggest(token string) string { + token = normalizeToken(token) + + if token == "" || dictionary == nil || dictionary.model == nil { + return "" + } + + if dictionary.isLexiconWord(token) { + return "" + } + + suggestions := dictionary.model.SpellCheckSuggestions(token, 1) + + if len(suggestions) == 0 { + return "" + } + + if suggestions[0] == token { + return "" + } + + return suggestions[0] +} + +func (dictionary *Dictionary) isLexiconWord(token string) bool { + if dictionary == nil { + return false + } + + if _, ok := dictionary.words[token]; ok { + return true + } + + candidates := make([]string, 0, 16) + candidates = append(candidates, inflectionCandidates(token)...) + candidates = append(candidates, spellingVariantCandidates(token)...) + + for _, candidate := range inflectionCandidates(token) { + candidates = append(candidates, spellingVariantCandidates(candidate)...) + } + + uniqueCandidates := make(map[string]struct{}, len(candidates)) + + for _, candidate := range candidates { + if candidate == "" || candidate == token { + continue + } + + if _, seen := uniqueCandidates[candidate]; seen { + continue + } + + uniqueCandidates[candidate] = struct{}{} + + if _, ok := dictionary.words[candidate]; ok { + return true + } + } + + return false +} + +func (dictionary *Dictionary) AbbreviationExpansion(token string) (string, bool) { + token = normalizeToken(token) + + if token == "" || dictionary == nil { + return "", false + } + + tokenLength := utf8.RuneCountInString(token) + + if tokenLength <= 1 || tokenLength > 4 { + return "", false + } + + firstCharacter, _ := utf8.DecodeRuneInString(token) + candidates := dictionary.wordsByFirstCharacter[firstCharacter] + + if len(candidates) == 0 { + return "", false + } + + bestCandidate := "" + bestScore := 1 << 30 + + for _, candidate := range candidates { + if !isLikelyAbbreviationForToken(token, candidate) { + continue + } + + score := abbreviationScore(token, candidate) + + if score < bestScore { + bestScore = score + bestCandidate = candidate + } + } + + if bestCandidate == "" { + return "", false + } + + return bestCandidate, true +} + +func isLikelyAbbreviationForToken(token string, candidate string) bool { + if candidate == "" || token == "" || token == candidate { + return false + } + + tokenLength := utf8.RuneCountInString(token) + candidateLength := utf8.RuneCountInString(candidate) + + if candidateLength <= tokenLength { + return false + } + + if !isSubsequence(token, candidate) { + return false + } + + if strings.HasPrefix(candidate, token) && tokenLength <= 4 { + return true + } + + tokenConsonants := consonantSkeleton(token) + candidateConsonants := consonantSkeleton(candidate) + + if tokenConsonants == "" || candidateConsonants == "" { + return false + } + + if isSubsequence(tokenConsonants, candidateConsonants) && tokenLength <= 5 { + return true + } + + return false +} + +func abbreviationScore(token string, candidate string) int { + tokenLength := utf8.RuneCountInString(token) + candidateLength := utf8.RuneCountInString(candidate) + lengthGap := max(candidateLength-tokenLength, 0) + score := lengthGap * 10 + + if strings.HasPrefix(candidate, token) { + score -= 3 + } + + return score +} + +func isSubsequence(shorter string, longer string) bool { + shorterRunes := []rune(shorter) + longerRunes := []rune(longer) + shorterIndex := 0 + + for _, character := range longerRunes { + if shorterIndex >= len(shorterRunes) { + break + } + + if shorterRunes[shorterIndex] == character { + shorterIndex++ + } + } + + return shorterIndex == len(shorterRunes) +} + +func consonantSkeleton(word string) string { + var builder strings.Builder + + for _, character := range word { + switch character { + case 'a', 'e', 'i', 'o', 'u': + continue + default: + builder.WriteRune(character) + } + } + + return builder.String() +} + +func inflectionCandidates(token string) []string { + candidates := make([]string, 0, 8) + + if strings.HasSuffix(token, "ies") && len(token) > 3 { + candidates = append(candidates, token[:len(token)-3]+"y") + } + + if strings.HasSuffix(token, "es") && len(token) > 2 { + candidates = append(candidates, token[:len(token)-2]) + } + + if strings.HasSuffix(token, "s") && len(token) > 1 { + candidates = append(candidates, token[:len(token)-1]) + } + + if strings.HasSuffix(token, "ed") && len(token) > 2 { + candidateWithoutSuffix := token[:len(token)-2] + candidates = append(candidates, candidateWithoutSuffix) + candidates = append(candidates, candidateWithoutSuffix+"e") + + if len(candidateWithoutSuffix) >= 2 { + lastCharacter := candidateWithoutSuffix[len(candidateWithoutSuffix)-1] + secondToLastCharacter := candidateWithoutSuffix[len(candidateWithoutSuffix)-2] + + if lastCharacter == secondToLastCharacter { + candidates = append(candidates, candidateWithoutSuffix[:len(candidateWithoutSuffix)-1]) + } + } + } + + if strings.HasSuffix(token, "ing") && len(token) > 3 { + candidateWithoutSuffix := token[:len(token)-3] + candidates = append(candidates, candidateWithoutSuffix) + candidates = append(candidates, candidateWithoutSuffix+"e") + } + + if strings.HasSuffix(token, "er") && len(token) > 2 { + candidateWithoutSuffix := token[:len(token)-2] + candidates = append(candidates, candidateWithoutSuffix) + candidates = append(candidates, candidateWithoutSuffix+"e") + + if len(candidateWithoutSuffix) >= 2 { + lastCharacter := candidateWithoutSuffix[len(candidateWithoutSuffix)-1] + secondToLastCharacter := candidateWithoutSuffix[len(candidateWithoutSuffix)-2] + + if lastCharacter == secondToLastCharacter { + candidates = append(candidates, candidateWithoutSuffix[:len(candidateWithoutSuffix)-1]) + } + } + } + + if strings.HasSuffix(token, "ize") && len(token) > 3 { + candidates = append(candidates, token[:len(token)-3]) + } + + if strings.HasSuffix(token, "ized") && len(token) > 4 { + candidates = append(candidates, token[:len(token)-4]) + } + + if strings.HasSuffix(token, "izing") && len(token) > 5 { + candidates = append(candidates, token[:len(token)-5]) + } + + if strings.HasSuffix(token, "izer") && len(token) > 4 { + candidates = append(candidates, token[:len(token)-4]) + } + + if strings.HasSuffix(token, "ization") && len(token) > 7 { + candidates = append(candidates, token[:len(token)-7]) + } + + return candidates +} + +func spellingVariantCandidates(token string) []string { + candidates := make([]string, 0, 8) + + appendSuffixVariant(&candidates, token, "isation", "ization") + appendSuffixVariant(&candidates, token, "ization", "isation") + appendSuffixVariant(&candidates, token, "ising", "izing") + appendSuffixVariant(&candidates, token, "izing", "ising") + appendSuffixVariant(&candidates, token, "ised", "ized") + appendSuffixVariant(&candidates, token, "ized", "ised") + appendSuffixVariant(&candidates, token, "iser", "izer") + appendSuffixVariant(&candidates, token, "izer", "iser") + appendSuffixVariant(&candidates, token, "ise", "ize") + appendSuffixVariant(&candidates, token, "ize", "ise") + appendSuffixVariant(&candidates, token, "our", "or") + appendSuffixVariant(&candidates, token, "or", "our") + appendSuffixVariant(&candidates, token, "tre", "ter") + appendSuffixVariant(&candidates, token, "ter", "tre") + + return candidates +} + +func appendSuffixVariant(candidates *[]string, token string, fromSuffix string, toSuffix string) { + if !strings.HasSuffix(token, fromSuffix) || len(token) <= len(fromSuffix) { + return + } + + root := token[:len(token)-len(fromSuffix)] + *candidates = append(*candidates, root+toSuffix) +} + +func makeWordSet(words []string) map[string]struct{} { + set := make(map[string]struct{}, len(words)) + + for _, word := range words { + set[word] = struct{}{} + } + + return set +} + +func makeWordsByFirstCharacter(words []string) map[rune][]string { + grouped := make(map[rune][]string) + + for _, word := range words { + firstCharacter, size := utf8.DecodeRuneInString(word) + + if firstCharacter == utf8.RuneError && size == 0 { + continue + } + + grouped[firstCharacter] = append(grouped[firstCharacter], word) + } + + for firstCharacter := range grouped { + sort.Strings(grouped[firstCharacter]) + } + + return grouped +} + +func loadWords() ([]string, error) { + configuredDictionaryPaths := parseDictionaryPaths(os.Getenv("KIVIA_DICTIONARY_PATH")) + + if len(configuredDictionaryPaths) > 0 { + words, err := loadWordsFromPaths(configuredDictionaryPaths, true) + + if err != nil { + return nil, err + } + + if len(words) == 0 { + return nil, errors.New("configured dictionary sources contain no usable words") + } + + return words, nil + } + + words, err := loadWordsFromPaths(defaultDictionaryPaths, false) + + if err != nil { + return nil, err + } + + if len(words) == 0 { + return nil, errors.New("no usable dictionary words found; set KIVIA_DICTIONARY_PATH") + } + + return words, nil +} + +func readWordsFromFile(filePath string) ([]string, error) { + file, err := os.Open(filePath) + + if err != nil { + return nil, err + } + + defer file.Close() + + words := make([]string, 0, 1024) + scanner := bufio.NewScanner(file) + isSpellDictionaryFile := strings.EqualFold(path.Ext(filePath), ".dic") + lineNumber := 0 + + for scanner.Scan() { + lineNumber++ + + line := normalizeDictionaryLine(scanner.Text(), lineNumber, isSpellDictionaryFile) + + if line == "" { + continue + } + + words = append(words, line) + } + + if err := scanner.Err(); err != nil { + return nil, err + } + + return normalizeWords(words), nil +} + +func parseDictionaryPaths(value string) []string { + trimmedValue := strings.TrimSpace(value) + + if trimmedValue == "" { + return nil + } + + expandedValue := strings.ReplaceAll(trimmedValue, ",", string(os.PathListSeparator)) + parts := strings.Split(expandedValue, string(os.PathListSeparator)) + paths := make([]string, 0, len(parts)) + + for _, entry := range parts { + candidate := strings.TrimSpace(entry) + + if candidate == "" { + continue + } + + paths = append(paths, candidate) + } + + return paths +} + +func loadWordsFromPaths(paths []string, strict bool) ([]string, error) { + combinedWords := make([]string, 0, 4096) + + for _, dictionaryPath := range paths { + words, err := readWordsFromFile(dictionaryPath) + + if err != nil { + if strict { + return nil, fmt.Errorf("failed to read dictionary %q: %w", dictionaryPath, err) + } + + continue + } + + combinedWords = append(combinedWords, words...) + } + + return normalizeWords(combinedWords), nil +} + +func normalizeDictionaryLine(line string, lineNumber int, isSpellDictionaryFile bool) string { + trimmedLine := strings.TrimSpace(line) + + if trimmedLine == "" || strings.HasPrefix(trimmedLine, "#") { + return "" + } + + if isSpellDictionaryFile && lineNumber == 1 { + if _, err := strconv.Atoi(trimmedLine); err == nil { + return "" + } + } + + if slashIndex := strings.Index(trimmedLine, "/"); slashIndex >= 0 { + trimmedLine = trimmedLine[:slashIndex] + } + + return trimmedLine +} + +func normalizeWords(words []string) []string { + unique := make(map[string]struct{}, len(words)) + + for _, word := range words { + normalized := normalizeToken(word) + + if normalized == "" { + continue + } + + if len(normalized) <= 1 { + continue + } + + unique[normalized] = struct{}{} + } + + output := make([]string, 0, len(unique)) + + for word := range unique { + output = append(output, word) + } + + sort.Strings(output) + + return output +} + +func normalizeToken(token string) string { + token = strings.ToLower(strings.TrimSpace(token)) + + if token == "" { + return "" + } + + match := wordPattern.FindString(token) + + if match == "" { + return "" + } + + return match +} + +func cachePath() (string, error) { + base, err := os.UserCacheDir() + + if err != nil { + return "", err + } + + return filepath.Join(base, "kivia", "fuzzy_model_v1.json"), nil +} + +func loadCachedModel() (*fuzzy.Model, error) { + path, err := cachePath() + + if err != nil { + return nil, err + } + + model, err := fuzzy.Load(path) + + if err != nil { + return nil, err + } + + return model, nil +} + +func saveCachedModel(model *fuzzy.Model) error { + if model == nil { + return errors.New("Model cannot be nil.") + } + + path, err := cachePath() + + if err != nil { + return err + } + + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + + return model.Save(path) +} + +var defaultDictionaryPaths = []string{ + "/usr/share/dict/words", + "/usr/dict/words", + "/usr/share/dict/web2", + "/usr/share/dict/web2a", + "/usr/share/dict/propernames", + "/usr/share/dict/connectives", + "/usr/share/hunspell/en_US.dic", + "/usr/share/hunspell/en_GB.dic", + "/usr/share/hunspell/en_CA.dic", + "/usr/share/hunspell/en_AU.dic", + "/usr/share/myspell/en_US.dic", + "/usr/share/myspell/en_GB.dic", + "/opt/homebrew/share/hunspell/en_US.dic", + "/opt/homebrew/share/hunspell/en_GB.dic", + "/usr/local/share/hunspell/en_US.dic", + "/usr/local/share/hunspell/en_GB.dic", +} diff --git a/internal/nlp/dictionary_test.go b/internal/nlp/dictionary_test.go new file mode 100644 index 0000000..c24e332 --- /dev/null +++ b/internal/nlp/dictionary_test.go @@ -0,0 +1,165 @@ +package nlp_test + +import ( + "github.com/Fuwn/kivia/internal/nlp" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestDictionaryRecognizesLexiconWords(testingContext *testing.T) { + dictionaryFile := filepath.Join("..", "..", "testdata", "dictionary", "words.txt") + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryFile) + + dictionary, err := nlp.NewDictionary() + + if err != nil { + testingContext.Fatalf("NewDictionary returned an error: %v", err) + } + + if !dictionary.IsWord("options") { + testingContext.Fatalf("Expected options to be recognized.") + } + + if !dictionary.IsWord("has") { + testingContext.Fatalf("Expected has to be recognized.") + } + + if !dictionary.IsWord("resources") { + testingContext.Fatalf("Expected resources to be recognized through plural inflection.") + } +} + +func TestDictionaryFindsAbbreviationExpansions(testingContext *testing.T) { + dictionaryFile := filepath.Join("..", "..", "testdata", "dictionary", "words.txt") + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryFile) + + dictionary, err := nlp.NewDictionary() + + if err != nil { + testingContext.Fatalf("NewDictionary returned an error: %v", err) + } + + cases := map[string]string{ + "expr": "expression", + "ctx": "context", + "err": "error", + } + + for token, expectedExpansion := range cases { + expansion, ok := dictionary.AbbreviationExpansion(token) + + if !ok { + testingContext.Fatalf("Expected an abbreviation expansion for %q.", token) + } + + if expansion != expectedExpansion { + testingContext.Fatalf("Expected %q to expand to %q, got %q.", token, expectedExpansion, expansion) + } + } +} + +func TestDictionaryLoadsFromMultipleDictionaryFiles(testingContext *testing.T) { + tempDirectory := testingContext.TempDir() + firstDictionaryPath := filepath.Join(tempDirectory, "first.txt") + secondDictionaryPath := filepath.Join(tempDirectory, "second.txt") + combinedPathList := strings.Join([]string{firstDictionaryPath, secondDictionaryPath}, string(os.PathListSeparator)) + + if err := os.WriteFile(firstDictionaryPath, []byte("alpha\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + if err := os.WriteFile(secondDictionaryPath, []byte("beta\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", combinedPathList) + + dictionary, err := nlp.NewDictionary() + + if err != nil { + testingContext.Fatalf("NewDictionary returned an error: %v", err) + } + + if !dictionary.IsWord("alpha") { + testingContext.Fatalf("Expected alpha to be recognized.") + } + + if !dictionary.IsWord("beta") { + testingContext.Fatalf("Expected beta to be recognized.") + } +} + +func TestDictionaryFailsWhenConfiguredPathHasNoWords(testingContext *testing.T) { + tempDirectory := testingContext.TempDir() + emptyDictionaryPath := filepath.Join(tempDirectory, "empty.txt") + + if err := os.WriteFile(emptyDictionaryPath, []byte("\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", emptyDictionaryPath) + + _, err := nlp.NewDictionary() + + if err == nil { + testingContext.Fatalf("Expected NewDictionary to fail when configured dictionary has no usable words.") + } +} + +func TestDictionaryRecognizesDerivedForms(testingContext *testing.T) { + tempDirectory := testingContext.TempDir() + dictionaryPath := filepath.Join(tempDirectory, "base_words.txt") + + if err := os.WriteFile(dictionaryPath, []byte("trim\ntoken\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPath) + + dictionary, err := nlp.NewDictionary() + + if err != nil { + testingContext.Fatalf("NewDictionary returned an error: %v", err) + } + + if !dictionary.IsWord("trimmed") { + testingContext.Fatalf("Expected trimmed to be recognized from trim.") + } + + if !dictionary.IsWord("tokenize") { + testingContext.Fatalf("Expected tokenize to be recognized from token.") + } +} + +func TestDictionaryRecognizesBritishAndAmericanVariants(testingContext *testing.T) { + tempDirectory := testingContext.TempDir() + dictionaryPath := filepath.Join(tempDirectory, "british_words.txt") + + if err := os.WriteFile(dictionaryPath, []byte("normalise\ncolour\ncentre\n"), 0o644); err != nil { + testingContext.Fatalf("os.WriteFile returned an error: %v", err) + } + + testingContext.Setenv("KIVIA_DICTIONARY_PATH", dictionaryPath) + + dictionary, err := nlp.NewDictionary() + + if err != nil { + testingContext.Fatalf("NewDictionary returned an error: %v", err) + } + + if !dictionary.IsWord("normalize") { + testingContext.Fatalf("Expected normalize to be recognized from normalise.") + } + + if !dictionary.IsWord("color") { + testingContext.Fatalf("Expected color to be recognized from colour.") + } + + if !dictionary.IsWord("center") { + testingContext.Fatalf("Expected center to be recognized from centre.") + } +} diff --git a/internal/report/report.go b/internal/report/report.go new file mode 100644 index 0000000..a97039e --- /dev/null +++ b/internal/report/report.go @@ -0,0 +1,80 @@ +package report + +import ( + "encoding/json" + "fmt" + "github.com/Fuwn/kivia/internal/analyze" + "github.com/Fuwn/kivia/internal/collect" + "io" + "strings" +) + +func Render(writer io.Writer, result analyze.Result, format string, includeContext bool) error { + switch strings.ToLower(format) { + case "json": + return renderJSON(writer, result, includeContext) + case "text", "": + return renderText(writer, result, includeContext) + default: + return fmt.Errorf("Unsupported output format %q. Use \"text\" or \"json\".", format) + } +} + +func renderText(writer io.Writer, result analyze.Result, includeContext bool) error { + if len(result.Violations) == 0 { + _, err := fmt.Fprintln(writer, "No naming violations found.") + + return err + } + + for _, violation := range result.Violations { + if _, err := fmt.Fprintf(writer, "%s:%d:%d %s %q: %s\n", + violation.Identifier.File, + violation.Identifier.Line, + violation.Identifier.Column, + violation.Identifier.Kind, + violation.Identifier.Name, + violation.Reason, + ); err != nil { + return err + } + + if includeContext { + contextParts := make([]string, 0, 3) + + if violation.Identifier.Context.Type != "" { + contextParts = append(contextParts, "type="+violation.Identifier.Context.Type) + } + + if violation.Identifier.Context.ValueExpression != "" { + contextParts = append(contextParts, "value="+violation.Identifier.Context.ValueExpression) + } + + if violation.Identifier.Context.EnclosingFunction != "" { + contextParts = append(contextParts, "function="+violation.Identifier.Context.EnclosingFunction) + } + + if len(contextParts) > 0 { + if _, err := fmt.Fprintf(writer, " context: %s\n", strings.Join(contextParts, ", ")); err != nil { + return err + } + } + } + } + + return nil +} + +func renderJSON(writer io.Writer, result analyze.Result, includeContext bool) error { + if !includeContext { + for index := range result.Violations { + result.Violations[index].Identifier.Context = collect.Context{} + } + } + + encoder := json.NewEncoder(writer) + + encoder.SetIndent("", " ") + + return encoder.Encode(result) +} |