githaven/modules/references/references.go
Oleksandr Redko 1761459ebc
Refactor to use UnsafeStringToBytes (#31358)
The PR replaces all `goldmark/util.BytesToReadOnlyString` with
`util.UnsafeBytesToString`, `goldmark/util.StringToReadOnlyBytes` with
`util.UnsafeStringToBytes`. This removes one `TODO`.

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
2024-06-14 01:26:33 +00:00

595 lines
19 KiB
Go

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package references
import (
"bytes"
"net/url"
"regexp"
"strconv"
"strings"
"sync"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup/mdstripper"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)
var (
// validNamePattern performs only the most basic validation for user or repository names
// Repository name should contain only alphanumeric, dash ('-'), underscore ('_') and dot ('.') characters.
validNamePattern = regexp.MustCompile(`^[a-z0-9_.-]+$`)
// NOTE: All below regex matching do not perform any extra validation.
// Thus a link is produced even if the linked entity does not exist.
// While fast, this is also incorrect and lead to false positives.
// TODO: fix invalid linking issue
// mentionPattern matches all mentions in the form of "@user" or "@org/team"
mentionPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(@[-\w][-.\w]*?|@[-\w][-.\w]*?/[-\w][-.\w]*?)(?:\s|$|[:,;.?!](\s|$)|'|\)|\])`)
// issueNumericPattern matches string that references to a numeric issue, e.g. #1287
issueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[|\'|\")([#!][0-9]+)(?:\s|$|\)|\]|\'|\"|[:;,.?!]\s|[:;,.?!]$)`)
// issueAlphanumericPattern matches string that references to an alphanumeric issue, e.g. ABC-1234
issueAlphanumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[|\"|\')([A-Z]{1,10}-[1-9][0-9]*)(?:\s|$|\)|\]|:|\.(\s|$)|\"|\')`)
// crossReferenceIssueNumericPattern matches string that references a numeric issue in a different repository
// e.g. org/repo#12345
crossReferenceIssueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+[#!][0-9]+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`)
// crossReferenceCommitPattern matches a string that references a commit in a different repository
// e.g. go-gitea/gitea@d8a994ef, go-gitea/gitea@d8a994ef243349f321568f9e36d5c3f444b99cae (7-40 characters)
crossReferenceCommitPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-zA-Z-_\.]+)/([0-9a-zA-Z-_\.]+)@([0-9a-f]{7,64})(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`)
// spaceTrimmedPattern let's find the trailing space
spaceTrimmedPattern = regexp.MustCompile(`(?:.*[0-9a-zA-Z-_])\s`)
// timeLogPattern matches string for time tracking
timeLogPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(@([0-9]+([\.,][0-9]+)?(w|d|m|h))+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`)
issueCloseKeywordsPat, issueReopenKeywordsPat *regexp.Regexp
issueKeywordsOnce sync.Once
giteaHostInit sync.Once
giteaHost string
giteaIssuePullPattern *regexp.Regexp
actionStrings = []string{
"none",
"closes",
"reopens",
"neutered",
}
)
// XRefAction represents the kind of effect a cross reference has once is resolved
type XRefAction int64
const (
// XRefActionNone means the cross-reference is simply a comment
XRefActionNone XRefAction = iota // 0
// XRefActionCloses means the cross-reference should close an issue if it is resolved
XRefActionCloses // 1
// XRefActionReopens means the cross-reference should reopen an issue if it is resolved
XRefActionReopens // 2
// XRefActionNeutered means the cross-reference will no longer affect the source
XRefActionNeutered // 3
)
func (a XRefAction) String() string {
return actionStrings[a]
}
// IssueReference contains an unverified cross-reference to a local issue or pull request
type IssueReference struct {
Index int64
Owner string
Name string
Action XRefAction
TimeLog string
}
// RenderizableReference contains an unverified cross-reference to with rendering information
// The IsPull member means that a `!num` reference was used instead of `#num`.
// This kind of reference is used to make pulls available when an external issue tracker
// is used. Otherwise, `#` and `!` are completely interchangeable.
type RenderizableReference struct {
Issue string
Owner string
Name string
CommitSha string
IsPull bool
RefLocation *RefSpan
Action XRefAction
ActionLocation *RefSpan
}
type rawReference struct {
index int64
owner string
name string
isPull bool
action XRefAction
issue string
refLocation *RefSpan
actionLocation *RefSpan
timeLog string
}
func rawToIssueReferenceList(reflist []*rawReference) []IssueReference {
refarr := make([]IssueReference, len(reflist))
for i, r := range reflist {
refarr[i] = IssueReference{
Index: r.index,
Owner: r.owner,
Name: r.name,
Action: r.action,
TimeLog: r.timeLog,
}
}
return refarr
}
// RefSpan is the position where the reference was found within the parsed text
type RefSpan struct {
Start int
End int
}
func makeKeywordsPat(words []string) *regexp.Regexp {
acceptedWords := parseKeywords(words)
if len(acceptedWords) == 0 {
// Never match
return nil
}
return regexp.MustCompile(`(?i)(?:\s|^|\(|\[)(` + strings.Join(acceptedWords, `|`) + `):? $`)
}
func parseKeywords(words []string) []string {
acceptedWords := make([]string, 0, 5)
wordPat := regexp.MustCompile(`^[\pL]+$`)
for _, word := range words {
word = strings.ToLower(strings.TrimSpace(word))
// Accept Unicode letter class runes (a-z, á, à, ä, )
if wordPat.MatchString(word) {
acceptedWords = append(acceptedWords, word)
} else {
log.Info("Invalid keyword: %s", word)
}
}
return acceptedWords
}
func newKeywords() {
issueKeywordsOnce.Do(func() {
// Delay initialization until after the settings module is initialized
doNewKeywords(setting.Repository.PullRequest.CloseKeywords, setting.Repository.PullRequest.ReopenKeywords)
})
}
func doNewKeywords(close, reopen []string) {
issueCloseKeywordsPat = makeKeywordsPat(close)
issueReopenKeywordsPat = makeKeywordsPat(reopen)
}
// getGiteaHostName returns a normalized string with the local host name, with no scheme or port information
func getGiteaHostName() string {
giteaHostInit.Do(func() {
if uapp, err := url.Parse(setting.AppURL); err == nil {
giteaHost = strings.ToLower(uapp.Host)
giteaIssuePullPattern = regexp.MustCompile(
`(\s|^|\(|\[)` +
regexp.QuoteMeta(strings.TrimSpace(setting.AppURL)) +
`([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+)/` +
`((?:issues)|(?:pulls))/([0-9]+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`)
} else {
giteaHost = ""
giteaIssuePullPattern = nil
}
})
return giteaHost
}
// getGiteaIssuePullPattern
func getGiteaIssuePullPattern() *regexp.Regexp {
getGiteaHostName()
return giteaIssuePullPattern
}
// FindAllMentionsMarkdown matches mention patterns in given content and
// returns a list of found unvalidated user names **not including** the @ prefix.
func FindAllMentionsMarkdown(content string) []string {
bcontent, _ := mdstripper.StripMarkdownBytes([]byte(content))
locations := FindAllMentionsBytes(bcontent)
mentions := make([]string, len(locations))
for i, val := range locations {
mentions[i] = string(bcontent[val.Start+1 : val.End])
}
return mentions
}
// FindAllMentionsBytes matches mention patterns in given content
// and returns a list of locations for the unvalidated user names, including the @ prefix.
func FindAllMentionsBytes(content []byte) []RefSpan {
// Sadly we can't use FindAllSubmatchIndex because our pattern checks for starting and
// trailing spaces (\s@mention,\s), so if we get two consecutive references, the space
// from the second reference will be "eaten" by the first one:
// ...\s@mention1\s@mention2\s... --> ...`\s@mention1\s`, (not) `@mention2,\s...`
ret := make([]RefSpan, 0, 5)
pos := 0
for {
match := mentionPattern.FindSubmatchIndex(content[pos:])
if match == nil {
break
}
ret = append(ret, RefSpan{Start: match[2] + pos, End: match[3] + pos})
notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos])
if notrail == nil {
pos = match[3] + pos
} else {
pos = match[3] + pos + notrail[1] - notrail[3]
}
}
return ret
}
// FindFirstMentionBytes matches the first mention in then given content
// and returns the location of the unvalidated user name, including the @ prefix.
func FindFirstMentionBytes(content []byte) (bool, RefSpan) {
mention := mentionPattern.FindSubmatchIndex(content)
if mention == nil {
return false, RefSpan{}
}
return true, RefSpan{Start: mention[2], End: mention[3]}
}
// FindAllIssueReferencesMarkdown strips content from markdown markup
// and returns a list of unvalidated references found in it.
func FindAllIssueReferencesMarkdown(content string) []IssueReference {
return rawToIssueReferenceList(findAllIssueReferencesMarkdown(content))
}
func findAllIssueReferencesMarkdown(content string) []*rawReference {
bcontent, links := mdstripper.StripMarkdownBytes([]byte(content))
return findAllIssueReferencesBytes(bcontent, links)
}
func convertFullHTMLReferencesToShortRefs(re *regexp.Regexp, contentBytes *[]byte) {
// We will iterate through the content, rewrite and simplify full references.
//
// We want to transform something like:
//
// this is a https://ourgitea.com/git/owner/repo/issues/123456789, foo
// https://ourgitea.com/git/owner/repo/pulls/123456789
//
// Into something like:
//
// this is a #123456789, foo
// !123456789
pos := 0
for {
// re looks for something like: (\s|^|\(|\[)https://ourgitea.com/git/(owner/repo)/(issues)/(123456789)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)
match := re.FindSubmatchIndex((*contentBytes)[pos:])
if match == nil {
break
}
// match is a bunch of indices into the content from pos onwards so
// to simplify things let's just add pos to all of the indices in match
for i := range match {
match[i] += pos
}
// match[0]-match[1] is whole string
// match[2]-match[3] is preamble
// move the position to the end of the preamble
pos = match[3]
// match[4]-match[5] is owner/repo
// now copy the owner/repo to end of the preamble
endPos := pos + match[5] - match[4]
copy((*contentBytes)[pos:endPos], (*contentBytes)[match[4]:match[5]])
// move the current position to the end of the newly copied owner/repo
pos = endPos
// Now set the issue/pull marker:
//
// match[6]-match[7] == 'issues'
(*contentBytes)[pos] = '#'
if string((*contentBytes)[match[6]:match[7]]) == "pulls" {
(*contentBytes)[pos] = '!'
}
pos++
// Then add the issue/pull number
//
// match[8]-match[9] is the number
endPos = pos + match[9] - match[8]
copy((*contentBytes)[pos:endPos], (*contentBytes)[match[8]:match[9]])
// Now copy what's left at the end of the string to the new end position
copy((*contentBytes)[endPos:], (*contentBytes)[match[9]:])
// now we reset the length
// our new section has length endPos - match[3]
// our old section has length match[9] - match[3]
*contentBytes = (*contentBytes)[:len(*contentBytes)-match[9]+endPos]
pos = endPos
}
}
// FindAllIssueReferences returns a list of unvalidated references found in a string.
func FindAllIssueReferences(content string) []IssueReference {
// Need to convert fully qualified html references to local system to #/! short codes
contentBytes := []byte(content)
if re := getGiteaIssuePullPattern(); re != nil {
convertFullHTMLReferencesToShortRefs(re, &contentBytes)
} else {
log.Debug("No GiteaIssuePullPattern pattern")
}
return rawToIssueReferenceList(findAllIssueReferencesBytes(contentBytes, []string{}))
}
// FindRenderizableReferenceNumeric returns the first unvalidated reference found in a string.
func FindRenderizableReferenceNumeric(content string, prOnly, crossLinkOnly bool) (bool, *RenderizableReference) {
var match []int
if !crossLinkOnly {
match = issueNumericPattern.FindStringSubmatchIndex(content)
}
if match == nil {
if match = crossReferenceIssueNumericPattern.FindStringSubmatchIndex(content); match == nil {
return false, nil
}
}
r := getCrossReference(util.UnsafeStringToBytes(content), match[2], match[3], false, prOnly)
if r == nil {
return false, nil
}
return true, &RenderizableReference{
Issue: r.issue,
Owner: r.owner,
Name: r.name,
IsPull: r.isPull,
RefLocation: r.refLocation,
Action: r.action,
ActionLocation: r.actionLocation,
}
}
// FindRenderizableCommitCrossReference returns the first unvalidated commit cross reference found in a string.
func FindRenderizableCommitCrossReference(content string) (bool, *RenderizableReference) {
m := crossReferenceCommitPattern.FindStringSubmatchIndex(content)
if len(m) < 8 {
return false, nil
}
return true, &RenderizableReference{
Owner: content[m[2]:m[3]],
Name: content[m[4]:m[5]],
CommitSha: content[m[6]:m[7]],
RefLocation: &RefSpan{Start: m[2], End: m[7]},
}
}
// FindRenderizableReferenceRegexp returns the first regexp unvalidated references found in a string.
func FindRenderizableReferenceRegexp(content string, pattern *regexp.Regexp) (bool, *RenderizableReference) {
match := pattern.FindStringSubmatchIndex(content)
if len(match) < 4 {
return false, nil
}
action, location := findActionKeywords([]byte(content), match[2])
return true, &RenderizableReference{
Issue: content[match[2]:match[3]],
RefLocation: &RefSpan{Start: match[0], End: match[1]},
Action: action,
ActionLocation: location,
IsPull: false,
}
}
// FindRenderizableReferenceAlphanumeric returns the first alphanumeric unvalidated references found in a string.
func FindRenderizableReferenceAlphanumeric(content string) (bool, *RenderizableReference) {
match := issueAlphanumericPattern.FindStringSubmatchIndex(content)
if match == nil {
return false, nil
}
action, location := findActionKeywords([]byte(content), match[2])
return true, &RenderizableReference{
Issue: content[match[2]:match[3]],
RefLocation: &RefSpan{Start: match[2], End: match[3]},
Action: action,
ActionLocation: location,
IsPull: false,
}
}
// FindAllIssueReferencesBytes returns a list of unvalidated references found in a byte slice.
func findAllIssueReferencesBytes(content []byte, links []string) []*rawReference {
ret := make([]*rawReference, 0, 10)
pos := 0
// Sadly we can't use FindAllSubmatchIndex because our pattern checks for starting and
// trailing spaces (\s#ref,\s), so if we get two consecutive references, the space
// from the second reference will be "eaten" by the first one:
// ...\s#ref1\s#ref2\s... --> ...`\s#ref1\s`, (not) `#ref2,\s...`
for {
match := issueNumericPattern.FindSubmatchIndex(content[pos:])
if match == nil {
break
}
if ref := getCrossReference(content, match[2]+pos, match[3]+pos, false, false); ref != nil {
ret = append(ret, ref)
}
notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos])
if notrail == nil {
pos = match[3] + pos
} else {
pos = match[3] + pos + notrail[1] - notrail[3]
}
}
pos = 0
for {
match := crossReferenceIssueNumericPattern.FindSubmatchIndex(content[pos:])
if match == nil {
break
}
if ref := getCrossReference(content, match[2]+pos, match[3]+pos, false, false); ref != nil {
ret = append(ret, ref)
}
notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos])
if notrail == nil {
pos = match[3] + pos
} else {
pos = match[3] + pos + notrail[1] - notrail[3]
}
}
localhost := getGiteaHostName()
for _, link := range links {
if u, err := url.Parse(link); err == nil {
// Note: we're not attempting to match the URL scheme (http/https)
host := strings.ToLower(u.Host)
if host != "" && host != localhost {
continue
}
parts := strings.Split(u.EscapedPath(), "/")
// /user/repo/issues/3
if len(parts) != 5 || parts[0] != "" {
continue
}
var sep string
if parts[3] == "issues" {
sep = "#"
} else if parts[3] == "pulls" {
sep = "!"
} else {
continue
}
// Note: closing/reopening keywords not supported with URLs
bytes := []byte(parts[1] + "/" + parts[2] + sep + parts[4])
if ref := getCrossReference(bytes, 0, len(bytes), true, false); ref != nil {
ref.refLocation = nil
ret = append(ret, ref)
}
}
}
if len(ret) == 0 {
return ret
}
pos = 0
for {
match := timeLogPattern.FindSubmatchIndex(content[pos:])
if match == nil {
break
}
timeLogEntry := string(content[match[2]+pos+1 : match[3]+pos])
var f *rawReference
for _, ref := range ret {
if ref.refLocation != nil && ref.refLocation.End < match[2]+pos && (f == nil || f.refLocation.End < ref.refLocation.End) {
f = ref
}
}
pos = match[1] + pos
if f == nil {
f = ret[0]
}
if len(f.timeLog) == 0 {
f.timeLog = timeLogEntry
}
}
return ret
}
func getCrossReference(content []byte, start, end int, fromLink, prOnly bool) *rawReference {
sep := bytes.IndexAny(content[start:end], "#!")
if sep < 0 {
return nil
}
isPull := content[start+sep] == '!'
if prOnly && !isPull {
return nil
}
repo := string(content[start : start+sep])
issue := string(content[start+sep+1 : end])
index, err := strconv.ParseInt(issue, 10, 64)
if err != nil {
return nil
}
if repo == "" {
if fromLink {
// Markdown links must specify owner/repo
return nil
}
action, location := findActionKeywords(content, start)
return &rawReference{
index: index,
action: action,
issue: issue,
isPull: isPull,
refLocation: &RefSpan{Start: start, End: end},
actionLocation: location,
}
}
parts := strings.Split(strings.ToLower(repo), "/")
if len(parts) != 2 {
return nil
}
owner, name := parts[0], parts[1]
if !validNamePattern.MatchString(owner) || !validNamePattern.MatchString(name) {
return nil
}
action, location := findActionKeywords(content, start)
return &rawReference{
index: index,
owner: owner,
name: name,
action: action,
issue: issue,
isPull: isPull,
refLocation: &RefSpan{Start: start, End: end},
actionLocation: location,
}
}
func findActionKeywords(content []byte, start int) (XRefAction, *RefSpan) {
newKeywords()
var m []int
if issueCloseKeywordsPat != nil {
m = issueCloseKeywordsPat.FindSubmatchIndex(content[:start])
if m != nil {
return XRefActionCloses, &RefSpan{Start: m[2], End: m[3]}
}
}
if issueReopenKeywordsPat != nil {
m = issueReopenKeywordsPat.FindSubmatchIndex(content[:start])
if m != nil {
return XRefActionReopens, &RefSpan{Start: m[2], End: m[3]}
}
}
return XRefActionNone, nil
}
// IsXrefActionable returns true if the xref action is actionable (i.e. produces a result when resolved)
func IsXrefActionable(ref *RenderizableReference, extTracker bool) bool {
if extTracker {
// External issues cannot be automatically closed
return false
}
return ref.Action == XRefActionCloses || ref.Action == XRefActionReopens
}