diff --git a/prreivew/Dockerfile b/pr_reivew/Dockerfile similarity index 100% rename from prreivew/Dockerfile rename to pr_reivew/Dockerfile diff --git a/prreivew/README.md b/pr_reivew/README.md similarity index 100% rename from prreivew/README.md rename to pr_reivew/README.md diff --git a/prreivew/config.yaml b/pr_reivew/config.yaml similarity index 30% rename from prreivew/config.yaml rename to pr_reivew/config.yaml index 97b6f41595e7f021d1fba6d351ec0167995b4484..92740293b95adbbd28e748e457351e2a45ee583c 100644 --- a/prreivew/config.yaml +++ b/pr_reivew/config.yaml @@ -1,5 +1,11 @@ access_token: gitee_host: -max_token_length: -encoding_name: \ No newline at end of file + +gpt: + max_token_length: + encoding_name: + url: + limit: + + diff --git a/prreivew/requirements.txt b/pr_reivew/requirements.txt similarity index 100% rename from prreivew/requirements.txt rename to pr_reivew/requirements.txt diff --git a/prreivew/src/app.py b/pr_reivew/src/app.py similarity index 100% rename from prreivew/src/app.py rename to pr_reivew/src/app.py diff --git a/prreivew/src/config/init_config.py b/pr_reivew/src/config/init_config.py similarity index 40% rename from prreivew/src/config/init_config.py rename to pr_reivew/src/config/init_config.py index acacb8226229cd32ceb799d8703b403981adfb2d..c29f1022b541e5cf6a73d45825c804aacd62db5b 100644 --- a/prreivew/src/config/init_config.py +++ b/pr_reivew/src/config/init_config.py @@ -1,27 +1,24 @@ -import os + import yaml -from gitee.gitee_api import GiteeApiCaller -from gpt.gpt import Gpt -from handle.diff import Diff_Prompt +from gitee.gitee_api import GiteeCaller +from gpt.bot import Gpt + def init_config(path): - print(os.getcwd()) with open(path, "r", encoding="utf-8") as f: config = yaml.safe_load(f) - GiteeApiCaller.init_config_attr(config["access_token"], config["gitee_host"]) + GiteeCaller.init_config_attr(config["access_token"], config["gitee_host"]) Gpt.init_config_attr( - config["gpt"]["use"], config["gpt"]["max_token_length"], config["gpt"]["encoding_name"], - config["gpt"]["host"], - config["gpt"]["Authorization"], + config["gpt"]["url"], + config["gpt"]["limit"], ) - Diff_Prompt.init_config_attr() diff --git a/prreivew/src/reviewCode/giteeApi.py b/pr_reivew/src/gitee/gitee_api.py similarity index 34% rename from prreivew/src/reviewCode/giteeApi.py rename to pr_reivew/src/gitee/gitee_api.py index 18063f87d7eb1df18bf2237dcf4a9553c5cb8ffd..aae6808f928e5f2e9ba448d7b2167c8aca656c5e 100644 --- a/prreivew/src/reviewCode/giteeApi.py +++ b/pr_reivew/src/gitee/gitee_api.py @@ -2,9 +2,17 @@ from loguru import logger import requests import json -class GiteeApi: + +class GiteeCaller: + access_token = '' + gitee_url = '' + + def init_config_attr(access_token, gitee_url): + GiteeCaller.access_token = access_token + GiteeCaller.gitee_url = gitee_url + +class GiteeApi(GiteeCaller): def __init__(self, data): - self.token = None self.cookies = None self.owner = None self.repo = None @@ -13,49 +21,60 @@ class GiteeApi: self.owner = project.get('namespace', None) self.repo = project.get('path', None) self.pr = data.get('pull_request', None) - self.prNumber = self.pr.get('number', None) - if (not self.owner) or (not self.repo) or (not self.pr) or (not self.prNumber): + self.pr_number = self.pr.get('number', None) + if (not self.owner) or (not self.repo) or (not self.pr) or (not self.pr_number): logger.error('not giteeApi param') - def listComments(self): - # 此处用的是pr中的comment,而不是issue中的comment + def list_comments(self): page = 1 - allComments = [] + all_comments = [] while True: - url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments?access_token={}&page={}&per_page=100&direction=desc'. \ - format(self.owner, self.repo, self.prNumber, self.token, page) - res = json.loads(requests.get(url = url).content.decode('utf-8')) - # res = requests.get(url = url).json() - allComments.extend(res) + url = f'{self.gitee_url}/{self.owner}/{self.repo}/pulls/{self.pr_number}/comments' + params = { + 'access_token': self.access_token, + 'page': page, + 'per_page': 100, + 'direction': 'desc' + } + res = json.loads(requests.get(url=url, params=params).content.decode('utf-8')) + all_comments.extend(res) page += 1 if not res or len(res) < 100: break - return allComments + return all_comments - def getAllCommitIds(self): - url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/commits?access_token={}'. \ - format(self.owner, self.repo, self.prNumber, self.token) - return json.loads(requests.get(url = url).content.decode('utf-8')) + def get_all_commit_ids(self): + url = f'{self.gitee_url}/{self.owner}/{self.repo}/pulls/{self.pr_number}/commits' + params = { + 'access_token': self.access_token + } + return json.loads(requests.get(url = url, params=params).content.decode('utf-8')) - def listReviewComments(self, prNumber): + def list_review_comments(self): page = 1 - allComments = [] + all_comments = [] while True: - url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments?access_token={}&page={}&per_page=100'. \ - format(self.owner, self.repo, prNumber, self.token, page) - res = json.loads(requests.get(url = url).content.decode('utf-8')) - allComments.extend(res) + url = f'{self.gitee_url}/{self.owner}/{self.repo}/pulls/{self.pr_number}/comments' + params = { + 'access_token': self.access_token, + 'page': page, + 'per_page': 100, + } + res = json.loads(requests.get(url = url, params=params).content.decode('utf-8')) + all_comments.extend(res) page += 1 if not res or len(res) < 100: break - return allComments + return all_comments - def submitReview(self, body, commitId): - url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(self.owner, self.repo, self.prNumber) + def submit_review(self, body, commitId, filename, line): + url = f'{self.gitee_url}/{self.owner}/{self.repo}/pulls/{self.pr_number}/comments' data = { - 'access_token': self.token, + 'access_token': self.access_token, 'body': body, - 'commit_id': commitId + 'commit_id': commitId, + "path": filename, + "position": line } res = requests.post(url = url, data = data) if res.status_code != 201: @@ -65,16 +84,18 @@ class GiteeApi: else: logger.info('post to gitee succeed') - def fetchPR(self): - url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}?access_token={}'. \ - format(self.owner, self.repo, self.prNumber, self.token) + def fetch_pr(self): + url = f'{self.gitee_url}/{self.owner}/{self.repo}/pulls/{self.pr_number}?access_token={self.access_token}' return json.loads(requests.get(url = url).content.decode('utf-8')) def compare(self, formerSha, latterSha): - url = 'https://gitee.com/api/v5/repos/{}/{}/compare/{}...{}?access_token={}&straight=true'. \ - format(self.owner, self.repo, formerSha, latterSha, self.token) - return json.loads(requests.get(url = url).content.decode('utf-8')) + url = f'{self.gitee_url}/{self.owner}/{self.repo}/compare/{formerSha}...{latterSha}' + params = { + 'access_token': self.access_token, + 'straight': True, + } + return json.loads(requests.get(url = url, params=params).content.decode('utf-8')) - def fetchFileContent(self, rawUrl): - url = '{}?access_token={}'.format(rawUrl, self.token) + def fetch_file_content(self, rawUrl): + url = f'{rawUrl}?access_token={self.access_token}' return requests.get(url, cookies = self.cookies).content.decode('utf-8') diff --git a/pr_reivew/src/gpt/bot.py b/pr_reivew/src/gpt/bot.py new file mode 100644 index 0000000000000000000000000000000000000000..1cb89d8d29611aa72adb74288c7366e954cc4eae --- /dev/null +++ b/pr_reivew/src/gpt/bot.py @@ -0,0 +1,120 @@ +import requests +import tiktoken +from loguru import logger + + +class Gpt: + max_token_length = 0 + encoding_name = '' + url = '' + limit = 5 + + def init_config_attr(max_token_length, encoding_name, url, limit): + Gpt.max_token_length = max_token_length + Gpt.encoding_name = encoding_name + Gpt.url = url + Gpt.limit = limit + + +class Bot(Gpt): + def __init__(self): + self.system_message = ''' + Input: New hunks annotated with line numbers and old hunks (replaced code). Hunks represent incomplete code fragments. + Additional Context: PR title, description, summaries and comment chains. + Task: Review new hunks for substantive issues using provided context and respond with comments if necessary. + Output: Review comments in markdown with exact line number ranges in new hunks. Start and end line numbers must be within the same hunk. For single-line comments, start=end line number. Must use example response format below. + Use fenced code blocks using the relevant language identifier where applicable. + Don't annotate code snippets with line numbers. Format and indent code correctly. + Do not use `suggestion` code blocks. + For fixes, use `diff` code blocks, marking changes with `+` or `-`. The line number range for comments with fix snippets must exactly match the range to replace in the new hunk. + + - Do NOT provide general feedback, summaries, explanations of changes, or praises + for making good additions. + - Focus solely on offering specific, objective insights based on the + given context and refrain from making broad comments about potential impacts on + the system or question intentions behind the changes. + + If there are no issues found on a line range, you MUST respond with the + text `LGTM!` for that line range in the review section. + + ## Example + + ### Example changes + + ---new_hunk--- + ``` + z = x / y + return z + + 20: def add(x, y): + 21: z = x + y + 22: retrn z + 23: + 24: def multiply(x, y): + 25: return x * y + + def subtract(x, y): + z = x - y + ``` + + ---old_hunk--- + ``` + z = x / y + return z + + def add(x, y): + return x + y + + def subtract(x, y): + z = x - y + ``` + + ---comment_chains--- + ``` + Please review this change. + ``` + + ---end_change_section--- + + ### Example response + + 22-22: + 这里有语法错误 + ```diff + - retrn z + + return z + ``` + ''' + + + def chat(self, prompt): + data = { + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "system", + "content": self.system_message + }, + { + "role": "user", + "content": prompt + } + ], + "temperature": 0.7, + } + response = requests.post( + self.url, json=data + ) + if response.status_code != 200: + logger.info("get answer error") + logger.info(response.status_code) + + return response + + def get_token_count(self, content): + encoding = tiktoken.get_encoding(self.encoding_name) + tokens = encoding.encode(content) + return len(tokens) + + def get_max_prompt_length(self): + return self.max_token_length \ No newline at end of file diff --git a/prreivew/src/handle/pull_request.py b/pr_reivew/src/handle/pull_request.py similarity index 84% rename from prreivew/src/handle/pull_request.py rename to pr_reivew/src/handle/pull_request.py index c1bd92f3170c26b38bcc58a8f9da35ab33801305..c3f716ebe9f730395fb7c736dba1192424370914 100644 --- a/prreivew/src/handle/pull_request.py +++ b/pr_reivew/src/handle/pull_request.py @@ -1,9 +1,8 @@ from loguru import logger -from handle.comment_command import * -from reviewCode.main import reviewMain +from review_code.review_task import review_task -comment_method = {"/summary-message": summary_message, "@PRReviewAI reivew": reviewMain} +comment_method = {"@PRReviewAI reivew": review_task} def merge_request_hooks(data): diff --git a/prreivew/src/handle/task.py b/pr_reivew/src/handle/task.py similarity index 100% rename from prreivew/src/handle/task.py rename to pr_reivew/src/handle/task.py diff --git a/prreivew/src/main.py b/pr_reivew/src/main.py similarity index 83% rename from prreivew/src/main.py rename to pr_reivew/src/main.py index 72365c7f8b3a1f81903a9848d5dedc9a5a828cbc..ac01ca7b23c0bd038dd2ca2960e30e6a15464364 100644 --- a/prreivew/src/main.py +++ b/pr_reivew/src/main.py @@ -6,9 +6,7 @@ from config import init_config @click.command() @click.option("--config", default="config.yaml", help="config file path") def main(config): - # Init config from yaml file init_config.init_config(config) - # Set up routing router.start_router() diff --git a/prreivew/src/reviewCode/Untitled-1.txt b/pr_reivew/src/review_code/Untitled-1.txt similarity index 100% rename from prreivew/src/reviewCode/Untitled-1.txt rename to pr_reivew/src/review_code/Untitled-1.txt diff --git a/prreivew/src/reviewCode/__init__.py b/pr_reivew/src/review_code/__init__.py similarity index 100% rename from prreivew/src/reviewCode/__init__.py rename to pr_reivew/src/review_code/__init__.py diff --git a/prreivew/src/reviewCode/commenter.py b/pr_reivew/src/review_code/commenter.py similarity index 69% rename from prreivew/src/reviewCode/commenter.py rename to pr_reivew/src/review_code/commenter.py index a199a16440440f108adc434635ee79aa936e44c0..fbe49ee1455c3ff1ae79d1f1d1c015aba2bb4523 100644 --- a/prreivew/src/reviewCode/commenter.py +++ b/pr_reivew/src/review_code/commenter.py @@ -1,8 +1,6 @@ from loguru import logger -import requests -import chardet -import json -from reviewCode.giteeApi import GiteeApi +from gitee.gitee_api import GiteeApi + class Commenter: COMMENT_GREETING = '(PRReviewAI COMMENT_GREETING)' @@ -35,44 +33,18 @@ class Commenter: def __init__(self, giteeApi: GiteeApi): self.giteeApi = giteeApi - - # def comment(self, message, tag, mode): - # target = -1 - # pr_number = gitee_hook.get('number', None) - # if pr_number is not None: - # target = pr_number - # else: - # logger.error('not pull request number') - - # if tag is None: - # tag = Commenter.COMMENT_GREETING - - # body = '''%s\n%s\n%s'''%(Commenter.COMMENT_GREETING, message, tag) - - # if mode == 'create': - # self.create(body, target) - # elif mode == 'replace': - # self.replace(body, tag, target) - # else: - # logger.warning('unknown mode: %s, use \'replace\' instead') - # self.replace(body, tag, target) - def listComments(self, prNumber): + def list_comments(self): # 返回当前prnumber的所有commentR - # res = requests.get('https://gitee.com/api/v5/repos/ggzzll1/temp/pulls/2/comments?access_token=7e68411eb68f4f52834ed8510a0656ce&page=1&per_page=100&direction=desc') - # return json.loads(res.content.decode('utf-8') - return self.giteeApi.listComments() - - - - def findCommentWithTag(self, tag, target): - comments = self.listComments(target) + return self.giteeApi.list_comments() + + def findCommentWithTag(self, tag): + comments = self.list_comments() for comment in comments: if comment.get('body', None) and tag in comment.get('body', None): return comment return '' - - + def getContentWithinTags(self, content: str, startTag: str, endTag: str) -> str: start = content.find(startTag) end = content.find(endTag) @@ -92,8 +64,8 @@ class Commenter: return '' return commentBody[start : end + len(Commenter.COMMIT_ID_END_TAG)] - def getAllCommitIds(self): - commits = self.giteeApi.getAllCommitIds() + def get_all_commit_ids(self): + commits = self.giteeApi.get_all_commit_ids() allCommits = [] for commit in commits: allCommits.append(commit.get('sha', None)) @@ -113,8 +85,8 @@ class Commenter: ids = commentBody[start + len(Commenter.COMMIT_ID_START_TAG): end] return ids.split(',') - def getCommentChainsWithinRange(self, prNumber, path, startLine, endLine, tag = ''): - existingComments = self.getCommentsWithinRange(prNumber, path, startLine, endLine) + def getCommentChainsWithinRange(self, path, startLine, endLine, tag = ''): + existingComments = self.getCommentsWithinRange(path, startLine, endLine) topLevelComments = [] for comment in existingComments: if not comment.get('in_reply_to_id', None): @@ -135,17 +107,25 @@ class Commenter: conversationChain.append('{}: {}'.format(comment.get('user', None).get('login', None), comment.get('body', None))) return '\n---\n'.join(conversationChain) - def getCommentsWithinRange(self, prNumber, path, startLine, endLine): - comments = self.listReviewComments(prNumber) + def getCommentsWithinRange(self, path, startLine, endLine): + comments = self.list_review_comments() requiredComments = [] - for comment in comments: - if comment.get('path', None) == path and comment.get('body', None) and \ - comment.get('comment_type', None) == 'diff_comment' and startLine <= comment.get('new_line', None) <= endLine: - requiredComments.append(comment) + try: + for comment in comments: + if comment.get('path', None) == path and \ + comment.get('body', None) and \ + comment.get('comment_type', None) == 'diff_comment' and \ + comment.get('new_line') and \ + comment.get('new_line') >= startLine and \ + comment.get('line') and \ + comment.get('line') <= endLine: + requiredComments.append(comment) + except Exception as e: + logger.info('*******************************getCommentsWithinRange error = ', e) return requiredComments - def listReviewComments(self, prNumber): - return self.giteeApi.listReviewComments(prNumber) + def list_review_comments(self): + return self.giteeApi.list_review_comments() def addReviewedCommitId(self, commentBody, commitId): start = commentBody.find(Commenter.COMMIT_ID_START_TAG) diff --git a/prreivew/src/reviewCode/input.py b/pr_reivew/src/review_code/input.py similarity index 100% rename from prreivew/src/reviewCode/input.py rename to pr_reivew/src/review_code/input.py diff --git a/prreivew/src/reviewCode/options.py b/pr_reivew/src/review_code/options.py similarity index 92% rename from prreivew/src/reviewCode/options.py rename to pr_reivew/src/review_code/options.py index 654c936ae083f3cce997caac3ff73365a6f87ec3..09823ff93883bedaef1a4fa3b74a2d64de827733 100644 --- a/prreivew/src/reviewCode/options.py +++ b/pr_reivew/src/review_code/options.py @@ -6,7 +6,7 @@ class Options: self.maxFiles = 1000 self.pathFilters= '' self.TokenLimits = 1024 - self.rules = {'*.txt': False, '*.py': False} + self.rules = {'*.txt': False, '*.py': False, '*.md': False} self.debug = False def checkPath(self, path): diff --git a/prreivew/src/reviewCode/prompts.py b/pr_reivew/src/review_code/prompts.py similarity index 94% rename from prreivew/src/reviewCode/prompts.py rename to pr_reivew/src/review_code/prompts.py index e8e4a8e5b5386f7573c758729238791c3c5849ca..b41331a0c59ad843f34cf930232341b6d02e77ca 100644 --- a/prreivew/src/reviewCode/prompts.py +++ b/pr_reivew/src/review_code/prompts.py @@ -1,4 +1,4 @@ -from reviewCode.input import Input +from review_code.input import Input class Prompts: @@ -17,7 +17,7 @@ class Prompts: self.triageFileDiff = ''' Please triagle the diff as \'NEEDS_REVIEW\' or \'APPROVED\'. ''' - self.reviewFileDiff = ''' + self.reviewFileDiffOld = ''' Input: New hunks annotated with line numbers and old hunks (replaced code). Hunks represent incomplete code fragments. Additional Context: PR title, description, summaries and comment chains. Task: Review new hunks for substantive issues using provided context and respond with comments if necessary. @@ -92,7 +92,13 @@ class Prompts: $patches ''' + + self.reviewFileDiff = ''' + ## 请用中文对 `$filename` 的语法错误进行 review,并给出修改意见 + $patches + ''' + def renderSummarizeFileDiff(self, inputIn, reviewSimpleChanges): prompt = self.summarizeFileDiff if not reviewSimpleChanges: diff --git a/pr_reivew/src/review_code/review.py b/pr_reivew/src/review_code/review.py new file mode 100644 index 0000000000000000000000000000000000000000..f8d991e5611efeecaf9490394cf75c4075665837 --- /dev/null +++ b/pr_reivew/src/review_code/review.py @@ -0,0 +1,297 @@ +import copy +import re +from loguru import logger +from review_code.prompts import Prompts +from review_code.options import Options +from gitee.gitee_api import GiteeApi +from gpt.bot import Bot +from review_code.input import Input +from review_code.commenter import Commenter + + +IGNORE_KEYWORD = '@PRReviewAI: ignore' + + +class CodeReview: + def __init__(self, data): + self.options = Options() + self.prompts = Prompts() + self.bot = Bot() + self.giteeApi = GiteeApi(data) + self.input = Input(data) + self.commenter = Commenter(self.giteeApi) + self.commits = [] + + def code_review(self): + # 如果body中包含IGNORE_KEYWORD,跳过本次reivew + if (IGNORE_KEYWORD in self.input.description): + logger.info("skipped: body contains ignore_keyword") + return + + # 已经在pr中出现的comment + existingSummarizeComment = self.commenter.findCommentWithTag(Commenter.SUMMARIZE_TAG) + + existingCommitIdsBlock = '' + existingSummarizeCommentBody = '' + if existingSummarizeComment: + existingSummarizeCommentBody = existingSummarizeComment.get('body', None) + self.input.rawSummary = self.commenter.getRawSummary(existingSummarizeCommentBody) + self.input.shortSummary = self.commenter.getShortSummary(existingSummarizeCommentBody) + existingCommitIdsBlock = self.commenter.getReviewedCommitIdsBlock(existingSummarizeCommentBody) + + allCommitIds = self.commenter.get_all_commit_ids() + highrestReviewedCommitId = '' + if existingCommitIdsBlock: + highrestReviewedCommitId = self.commenter.getHighestReviewedCommitId(allCommitIds, self.commenter.getReviewedCommitIds(existingCommitIdsBlock)) + + # 获取PR + prs = self.giteeApi.fetch_pr() + # head:补丁分支,作者修改代码后提交PR对应的commit + # base:基准分支,接受修改的分支 + # PR:将补丁分支head中的代码合入基准分支base + headSha = None + baseSha = None + if prs.get('head', None): + headSha = prs.get('head', None).get('sha', None) + if prs.get('base', None): + baseSha = prs.get('base', None).get('sha', None) + + if (not highrestReviewedCommitId) or highrestReviewedCommitId == headSha: + logger.info('will review from the base commit: {}'.format(baseSha)) + highrestReviewedCommitId = baseSha + else: + logger.info('will review from commit: {}'.format(highrestReviewedCommitId)) + + # 比较highrestReviewedCommitId及headSha + incrementalDiff = self.giteeApi.compare(highrestReviewedCommitId, headSha) + targetBranchDiff = self.giteeApi.compare(baseSha, headSha) + + incrementalFiles = incrementalDiff.get('files', None) + targetBranchFiles = targetBranchDiff.get('files', None) + if (not incrementalFiles) and (not targetBranchFiles): + logger.warning('skipped: files data is missing') + return + + incrementalFilesNames = [] + for incrementalFile in incrementalFiles: + if incrementalFile.get('filename', None): + incrementalFilesNames.append(incrementalFile.get('filename')) + + files = [] + for targetBranchFile in targetBranchFiles: + if targetBranchFile.get('filename', None) in incrementalFilesNames: + files.append(targetBranchFile) + if len(files) == 0: + logger.warning('skipped: files is null') + return + + filterSelectedFiles = [] + filterIgnoredFiles = [] + for aFile in files: + if self.options.checkPath(aFile.get('filename', None)): + filterSelectedFiles.append(aFile) + else: + logger.info('skip for excluded path: %s'%(aFile.get('filename', None))) + filterIgnoredFiles.append(aFile) + if len(filterSelectedFiles) == 0: + logger.warning('skipped: filterSelectedFiles is null') + return + + # 获取本次pr的所有commitid + commits = [] + if incrementalDiff.get('commits', None): + for commit in incrementalDiff.get('commits', None): + commits.append(commit.get('sha', None)) + + if not commits: + logger.warning('skipped: commits is null') + return + self.commits = commits + + filteredFiles = [] + # 把patch切割成hunk + for aFile in filterSelectedFiles: + if not self.giteeApi.pr_number: + logger('skipped: pr is null') + continue + + fileContent = '' + try: + rawUrl = aFile.get('raw_url', None) + if rawUrl: + fileContent = self.giteeApi.fetch_file_content(rawUrl) + except Exception as e: + logger.warning('failed to get file contents: %s'%(e)) + + fileDiff = aFile.get('patch', '') + diff_lines = fileDiff.splitlines() + + patches = [] + diff_num = 0 + for patch in self.splitPatch(aFile.get('patch', '')): + diff_num += 1 + patchLines = self.patchStartEndLine(patch) + if not patchLines: + continue + hunks = self.parsePatch(patch) + if not hunks: + continue + + hunksStr = '''---new_hunk---\n\'\'\'\n%s\n\'\'\'\n---old_hunk---\n\'\'\'\n%s\n\'\'\''''%(hunks.get('newHunk', None), hunks.get('oldHunk', None)) + comment_diff_line = self.get_patch_diff_line(diff_num, diff_lines) + patches.append([patchLines.get('newHunk', None).get('startLine', None), patchLines.get('newHunk', None).get('endLine', None), hunksStr, comment_diff_line]) + if len(patches) > 0: + filteredFiles.append([aFile.get('filename', None), fileContent, fileDiff, patches]) + + filesAndChanges = filteredFiles + if len(filesAndChanges) == 0: + logger.error('skipped: no files to review') + return + + # 跳过summary,直接review + filesAndChangesReview = filesAndChanges + + for filename, fileContent, _, patches in filesAndChangesReview: + lgtm_num = self.do_review(filename, self.input, patches) + if lgtm_num == len(patches): + self.giteeApi.submit_review(body = "/lgtm", commitId = self.commits[0], filename=filename, line=patches[-1][-1]) + + def get_patch_diff_line(self, diff_num, diff_lines): + line_no = 0 + hit_no = 0 + for line in diff_lines: + if line.startswith('@@'): + if hit_no == diff_num: + return line_no + hit_no += 1 + line_no += 1 + return line_no + + def splitPatch(self, patch): + if not patch: + return [] + results = [] + splitLines = patch.split('\n') + # 去掉最后一行空格 + splitLines = splitLines[:-1] + lastLine = -1 + for iLine in range(len(splitLines)): + # 当前行数据格式是否满足:@@ -0,0 +0,0 @@ + reSplit = re.split('^@@ -(\d+),(\d+) \+(\d+),(\d+) @@', splitLines[iLine]) + if len(reSplit) > 1: + if lastLine == -1: + lastLine = iLine + else: + results.append('\n'.join(splitLines[lastLine: iLine])) + lastLine = iLine + if lastLine != -1: + results.append('\n'.join(splitLines[lastLine:])) + return results + + def patchStartEndLine(self, patch): + reSplit = re.split('^@@ -(\d+),(\d+) \+(\d+),(\d+) @@', patch) + if len(reSplit) > 1: + oldBegin = int(reSplit[1]) + oldDiff = int(reSplit[2]) + newBegin = int(reSplit[3]) + newDiff = int(reSplit[4]) + return {'oldHunk': {'startLine': oldBegin, 'endLine': oldDiff}, 'newHunk': {'startLine': newBegin, 'endLine': newDiff}} + else: + return None + + def parsePatch(self, patch): + hunkInfo = self.patchStartEndLine(patch) + if not hunkInfo: + return + oldHunkLines = [] + newHunkLines = [] + newLine = hunkInfo.get('newHunk', None).get('startLine', None) + # 去除第一行@@ + lines = patch.split('\n') [1:] + # 去除最后一行空格 + if lines[-1] == '': + lines = lines[:-1] + skipStart = 3 + skipEnd = 3 + currentLine = 0 + + # reamovalOnly=True代表只删除内容,没有新增内容 + removalOnly = True + for line in lines: + if line.startswith('+'): + removalOnly = False + break + + for line in lines: + currentLine += 1 + if line.startswith('-'): + oldHunkLines.append(line[1:]) + elif line.startswith('+'): + newHunkLines.append(line[1:]) + newLine += 1 + else: + oldHunkLines.append(line) + if removalOnly or (currentLine > skipStart and currentLine <= len(lines) - skipEnd): + newHunkLines.append(str(newLine) + ': ' + line) + else: + newHunkLines.append(line) + newLine += 1 + return {"oldHunk": '\n'.join(oldHunkLines), "newHunk": '\n'.join(newHunkLines)} + + def do_review(self, filename, input, patches): + logger.info('reviewing: {}'.format(filename)) + ins = copy.deepcopy(input) + ins.filename = filename + lgtm_num = 0 + + tokens = self.bot.get_token_count(self.bot.system_message) + + # 计算有多少个hunkstr可以放入prompt + # 当 prompt 的 token > max_token_length 时, 需要将 patch 切割吗? + patchesToPack = 0 + for _, _, patch, _ in patches: + patchTokens = self.bot.get_token_count(patch) + if tokens + patchTokens > self.bot.max_token_length: + logger.info('only packing {}/{} patches, tokens: {}/{}'.format(patchesToPack, len(patches), tokens, self.bot.max_token_length)) + break + # tokens += patchTokens + patchesToPack += 1 + + + patchesPacked = 0 + for startLine, endLine, patch, comment_line in patches: + if patchesPacked >= patchesToPack: + logger.info('unable to pack more patches into this request, packed: {}, total patches: {}, skipping'.format(patchesPacked, len(patches))) + if self.options.debug: + logger.info('prompt so far: {}'.format(self.prompts.renderReviewFileDiff(ins))) + break + patchesPacked += 1 + commentChain = '' + allChians = self.commenter.getCommentChainsWithinRange(filename, startLine, endLine, self.commenter.COMMENT_REPLY_TAG) + if len(allChians) > 0: + logger.info('Found comment chains: {} for {}'.format(allChians, filename)) + commentChain = allChians + commentChainTokens = self.bot.get_token_count(commentChain) + if tokens + commentChainTokens > self.bot.max_token_length: + commentChain = '' + else: + tokens += commentChainTokens + + ins.patches = patch + if commentChain: + ins.patches += '---comment_chains---\n\'\'\'{}\'\'\'---end_change_section---'.format(commentChain) + if patchesPacked > 0: + messages = self.prompts.renderReviewFileDiff(ins) + res = self.bot.chat(messages) + + if res.status_code != 200: + logger.info('review: nothing obtained from openai') + return '{} (no response)'.format(filename) + + ans = res.json() + if ('LGTM' not in ans): + self.giteeApi.submit_review(body = ans, commitId = self.commits[0], filename=filename, line=comment_line) + + else: + lgtm_num += 1 + return lgtm_num diff --git a/pr_reivew/src/review_code/review_task.py b/pr_reivew/src/review_code/review_task.py new file mode 100644 index 0000000000000000000000000000000000000000..2e974e4617178f6812b083e3002ab2af9fe5574b --- /dev/null +++ b/pr_reivew/src/review_code/review_task.py @@ -0,0 +1,9 @@ + +from review_code.review import CodeReview + + +def review_task(data): + + if data.get('noteable_type', None) == 'PullRequest': + codeReview = CodeReview(data) + codeReview.code_review() diff --git a/prreivew/src/router/router.py b/pr_reivew/src/router/router.py similarity index 55% rename from prreivew/src/router/router.py rename to pr_reivew/src/router/router.py index 86da14253e7c89de1b3de593183b57b765a874da..8221e8cc246e8eb414487475e6bb3b3783369dc0 100644 --- a/prreivew/src/router/router.py +++ b/pr_reivew/src/router/router.py @@ -20,15 +20,6 @@ def analyze(): @app.before_request def before_request(): headers = request.headers - # if headers.get("User-Agent") != "Robot-Gitee-Access": - # return "Bad Request: unknown User-Agent Header", 400 - - # if headers.get("X-Gitee-Event") == "": - # return "Bad Request: Missing X-Gitee-Event Header", 400 - - # uuid = headers.get("X-Gitee-Timestamp") - # if uuid == "": - # return "Bad Request: Missing X-Gitee-Timestamp Header", 400 def start_router(): diff --git a/prreivew/src/utils/background_task.py b/pr_reivew/src/utils/background_task.py similarity index 100% rename from prreivew/src/utils/background_task.py rename to pr_reivew/src/utils/background_task.py diff --git a/prreivew/src/utils/utile_tool.py b/pr_reivew/src/utils/utile_tool.py similarity index 100% rename from prreivew/src/utils/utile_tool.py rename to pr_reivew/src/utils/utile_tool.py diff --git a/prreivew/src/gitee/gitee_api.py b/prreivew/src/gitee/gitee_api.py deleted file mode 100644 index aa7866d7a128db74928651e44f334e283b0e2e03..0000000000000000000000000000000000000000 --- a/prreivew/src/gitee/gitee_api.py +++ /dev/null @@ -1,47 +0,0 @@ -import requests -from loguru import logger - - -class GiteeApiCaller: - access_token = "" - gitee_host = "" - - def init_config_attr(access_token, gitee_host): - GiteeApiCaller.access_token = access_token - GiteeApiCaller.gitee_host = gitee_host - - -class PullRequestComments(GiteeApiCaller): - submit_pull_request_comments_url_template = ( - "{host}/api/v5/repos/{owner}/{repo}/pulls/{number}/comments" - ) - - def __init__(self, owner, repo, number, body, commit_id, path, position): - self.owner = owner - self.repo = repo - self.number = number - self.body = body - self.commit_id = commit_id - self.path = path - self.position = position - - def submit_pull_request_comments(self): - url = self.submit_pull_request_comments_url_template.format( - host=self.gitee_host, owner=self.owner, repo=self.repo, number=self.number - ) - - form_data = { - "access_token": self.access_token, - "body": self.body, - "commit_id": self.commit_id, - "path": self.path, - "position": self.position, - } - response = requests.post(url, data=form_data) - - if response.status_code == 201: - logger.info("post to gitee success") - else: - logger.info("post to gitee failed") - logger.info(response.status_code) - logger.info(response.text) diff --git a/prreivew/src/gpt/chat_gpt.py b/prreivew/src/gpt/chat_gpt.py deleted file mode 100644 index d84cf44f7bdd7befddb8cc35a6b2bb54b5ee09fa..0000000000000000000000000000000000000000 --- a/prreivew/src/gpt/chat_gpt.py +++ /dev/null @@ -1,92 +0,0 @@ -import requests -import tiktoken - -from gpt.gpt import Gpt - - -class ChatGpt(Gpt): - def get_answer(prompt): - url = "{openai_host}/v1/chat/completions".format(openai_host=ChatGpt.host) - - data = { - "model": "gpt-3.5-turbo", - "messages": [ - { - "role": "system", - "content": ( - "您将充当 git 中提交消息的作者。" - "您的任务是在传统git提交中创建清晰且全面的提交消息,详细清晰的解释更改内容。 我将向您发送“git diff --staged”命令的输出,然后您将其转换为提交消息。" - "行长度不得超过 74 个字符。" - "用中文回答。" - "使用如下模板:" - "修改了那个文件\n" - "- 修改细节1\n" - "- 修改细节2\n" - ), - }, - { - "role": "user", - "content": prompt, - } - ], - "temperature": 0.7, - } - - - response = requests.post( - url, json=data, headers={"Authorization": "Bearer " + ChatGpt.Authorization} - ) - - if response.status_code != 200: - print("get answer error") - print(response.status_code) - - pr = response.json() - - return pr["choices"][0]["message"]["content"] - - - - def get_summary(content): - url = "{openai_host}/v1/chat/completions".format(openai_host=ChatGpt.host) - data = { - "model": "gpt-3.5-turbo", - "messages": [ - { - "role": "system", - "content": ( - "您的任务是高度概括总结我给您的输入内容。" - "用中文回答。" - ), - }, - { - "role": "user", - "content": content, - } - ], - "temperature": 0.7, - } - - response = requests.post( - url, json=data, headers={"Authorization": "Bearer " + ChatGpt.Authorization} - ) - - if response.status_code != 200: - print("get answer error") - print(response.status_code) - - pr = response.json() - - return pr["choices"][0]["message"]["content"] - - - - - def num_tokens_from_string(string: str) -> int: - encoding = tiktoken.get_encoding(ChatGpt.encoding_name) - tokens = encoding.encode(string) - num_tokens = len(tokens) - return num_tokens - - def get_max_prompt_length(): - return ChatGpt.max_token_length diff --git a/prreivew/src/gpt/gpt.py b/prreivew/src/gpt/gpt.py deleted file mode 100644 index 6d7cbc4495bef1c27bb6dfcf04d54811d1ad7d2f..0000000000000000000000000000000000000000 --- a/prreivew/src/gpt/gpt.py +++ /dev/null @@ -1,28 +0,0 @@ -from abc import ABCMeta, abstractmethod - - -class Gpt(metaclass=ABCMeta): - use = "" - max_token_length = 0 - encoding_name = "" - host = "" - Authorization = "" - - def init_config_attr(use, max_token_length, encoding_name, host, Authorization): - Gpt.use = use - Gpt.max_token_length = max_token_length - Gpt.encoding_name = encoding_name - Gpt.host = host - Gpt.Authorization = Authorization - - @abstractmethod - def get_answer(prompt): - pass - - @abstractmethod - def num_tokens_from_string(string: str) -> int: - pass - - @abstractmethod - def get_max_prompt_length(): - pass diff --git a/prreivew/src/gpt/gpt_class_factory.py b/prreivew/src/gpt/gpt_class_factory.py deleted file mode 100644 index 5a131c5afae5adf14b669617d8f0c047c2ccd736..0000000000000000000000000000000000000000 --- a/prreivew/src/gpt/gpt_class_factory.py +++ /dev/null @@ -1,14 +0,0 @@ -from gpt.chat_gpt import ChatGpt -from gpt.gpt import Gpt -from gpt.my_gpt import MyGpt - - -class GptClassFactory: - @staticmethod - def create_class(): - if Gpt.use == "my_gpt": - return MyGpt - elif Gpt.use == "open_ai": - return ChatGpt - else: - raise ValueError("Invalid class name") diff --git a/prreivew/src/gpt/my_gpt.py b/prreivew/src/gpt/my_gpt.py deleted file mode 100644 index dc16cbf9320fd515ba0cdb9e28cddfb3b56e9b60..0000000000000000000000000000000000000000 --- a/prreivew/src/gpt/my_gpt.py +++ /dev/null @@ -1,56 +0,0 @@ -import re -import requests -import tiktoken -from loguru import logger - -from gpt.gpt import Gpt - - -class MyGpt(Gpt): - question = ( - "You are to act as the author of a commit message in git." - "Your mission is to create clean and comprehensive commit messages in the conventional commit convention and explain WHAT were the changes and WHY the changes were done. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message." - "Do not preface the commit with anything." - "Don't add any descriptions to the commit, only commit message." - "Use the present tense. Lines must not be longer than 74 characters." - "Use Chinese to answer." - "The diff is: {diff_content}" - ) - - def get_answer(prompt): - url = "{host}/hcstream".format(host=MyGpt.host) - - data = {"question": MyGpt.question.format(diff_content=prompt), "history": []} - response = requests.post(url, json=data, stream=True) - if response.status_code != 200: - logger.error("get answer error") - logger.error(response.status_code) - return - - data_list = [] - pattern = r'"answer":\s+"([^"]+)"' - for line in response.iter_lines(): - if line: - line_text = line.decode("utf-8") - match = re.search(pattern, line_text) - if match: - result = match.group(1) - data_list.append(result) - - if len(data_list) < 2: - logger.info("no answer") - return - data_list.pop() - - combined_result = "".join(data_list) - - return combined_result - - def num_tokens_from_string(string: str) -> int: - encoding = tiktoken.get_encoding(MyGpt.encoding_name) - tokens = encoding.encode(string) - num_tokens = len(tokens) - return num_tokens - - def get_max_prompt_length(): - return MyGpt.max_token_length - MyGpt.num_tokens_from_string(MyGpt.question) diff --git a/prreivew/src/handle/comment_command.py b/prreivew/src/handle/comment_command.py deleted file mode 100644 index 4d6e08bf80127642433b671a39976df3e52a7e62..0000000000000000000000000000000000000000 --- a/prreivew/src/handle/comment_command.py +++ /dev/null @@ -1,78 +0,0 @@ -import time -from loguru import logger -import requests -from gitee.gitee_api import PullRequestComments -from gpt.gpt_class_factory import GptClassFactory - -from handle.diff import Diff_Prompt, handle_diff - - -def summary_message(data): - pr = data.get("pull_request", None) - if pr is None: - logger.error("no pull_request") - return - - diff_url = pr.get("diff_url", None) - if diff_url is None: - logger.error("no diff") - return - - diff = requests.get(diff_url) - if diff.status_code != 200: - logger.error("get diff error") - return - - diff.encoding = "utf-8" - diff_text = diff.text - - results = handle_diff(diff_text) - if results is None: - logger.error("can't get prompts") - return - - comment_list = [] - - - for result in results: - answer = GptClassFactory.create_class().get_answer(result) - if answer is None: - continue - comment_list.append(answer) - - time.sleep(10) - - comment = "" - - for single_comment in comment_list: - single_comment = str(single_comment).strip() - if single_comment == "": - continue - comment += single_comment + "\n\n" - - # summarize = GptClassFactory.create_class().get_summary(comment) - # comment += summarize - - project = data.get("project", None) - if project is None: - logger.error("no project") - return - - owner = project.get("namespace", None) - if owner is None: - logger.error("no owner") - return - - repo = project.get("path", None) - if repo is None: - logger.error("no repo") - return - - number = pr.get("number", None) - if number is None: - logger.error("no number") - return - - pr = PullRequestComments(owner, repo, number, comment, None, None, None) - - pr.submit_pull_request_comments() diff --git a/prreivew/src/handle/diff.py b/prreivew/src/handle/diff.py deleted file mode 100644 index da509bf25f263dc619112938ad30719c21ecc2a5..0000000000000000000000000000000000000000 --- a/prreivew/src/handle/diff.py +++ /dev/null @@ -1,93 +0,0 @@ -import re -from loguru import logger -from gpt.gpt_class_factory import GptClassFactory - -from utils.utile_tool import split_string - - -class Diff_Prompt: - max_template_token_length = 1024 - - def init_config_attr(): - Diff_Prompt.max_template_token_length = ( - GptClassFactory.create_class().get_max_prompt_length() - ) - - -def diff_content_out_of_length(diff_content): - return ( - GptClassFactory.create_class().num_tokens_from_string(diff_content) - >= Diff_Prompt.max_template_token_length - ) - - -def handle_diff(diff): - prompt_list = [] - - if diff_content_out_of_length(diff): - diff_array = cut_diff_by_file_diffs(diff) - - for single_diff in diff_array: - if diff_content_out_of_length(single_diff): - result_array = cut_single_diff(single_diff) - prompt_list.extend(result_array) - else: - prompt_list.append(single_diff) - - return prompt_list - - -def cut_diff_by_file_diffs(diff): - separator = "diff --git" - - diff_array = diff.split(separator) - diff_array.pop(0) - - diff_array = [separator + diff for diff in diff_array] - - return diff_array - - -def cut_one_diff_by_change(diff): - separator = "@@ -" - - change_array = diff.split(separator) - diff_title = change_array.pop(0) - - change_array = [separator + diff for diff in change_array] - - change_array[0] = diff_title + change_array[0] - - return change_array - - -def cut_single_diff(diff): - result_array = [] - change_array = cut_one_diff_by_change(diff) - - index = 0 - while index < len(change_array): - if diff_content_out_of_length(change_array[index]): - result_array.extend(cut_change(change_array[index])) - index += 1 - continue - - merge_change = change_array[index] - for i in range(index + 1, len(change_array)): - if diff_content_out_of_length(merge_change + change_array[i]): - index = i - break - else: - index = i + 1 - merge_change += change_array[i] - - result_array.append(merge_change) - - if index >= len(change_array) - 1: - break - - return result_array - - -def cut_change(change): - return split_string(change, Diff_Prompt.max_template_token_length) diff --git a/prreivew/src/reviewCode/bot.py b/prreivew/src/reviewCode/bot.py deleted file mode 100644 index 7a4f47c991e49c03f10f2319645888a078486f96..0000000000000000000000000000000000000000 --- a/prreivew/src/reviewCode/bot.py +++ /dev/null @@ -1,14 +0,0 @@ -import requests -from loguru import logger - -class Bot: - def __init__(self): - self.answer = '' - - - def chat(self, prompt): - url = 'https://modelapi.osinfra.cn/hcstream' - data = {'question': prompt, 'history': []} - response = requests.post(url, json = data, stream = True) - return response - \ No newline at end of file diff --git a/prreivew/src/reviewCode/main.py b/prreivew/src/reviewCode/main.py deleted file mode 100644 index c4afc1158f9e64a3fc213a2697683653ac509377..0000000000000000000000000000000000000000 --- a/prreivew/src/reviewCode/main.py +++ /dev/null @@ -1,15 +0,0 @@ -from reviewCode.review import codeReview -from reviewCode.options import Options -from reviewCode.prompts import Prompts -from reviewCode.giteeApi import GiteeApi -from reviewCode.bot import Bot - -def reviewMain(data): - # options = Options() - # prompts = Prompts() - # bot = Bot() - # giteeApi = GiteeApi() - - - if data.get('noteable_type', None) == 'PullRequest': - codeReview(data) diff --git a/prreivew/src/reviewCode/review.py b/prreivew/src/reviewCode/review.py deleted file mode 100644 index 6d620d5fede0cba40291c12af6a5bafadc37ca07..0000000000000000000000000000000000000000 --- a/prreivew/src/reviewCode/review.py +++ /dev/null @@ -1,316 +0,0 @@ -from loguru import logger -import json -import requests -# from handle.diff import Diff_prompt, handle_diff -from gpt.gpt import Gpt -import time -# from gitee.gitee_api import pull_request_comments -from diff import diff -from git.repo import Repo -import base64 -import re -import copy -from reviewCode.prompts import Prompts -from reviewCode.options import Options -from reviewCode.prompts import Prompts -from reviewCode.giteeApi import GiteeApi -from reviewCode.bot import Bot -from reviewCode.input import Input -from reviewCode.commenter import Commenter -from reviewCode.tokenizer import getTokenCount - -IGNORE_KEYWORD = '@PRReviewAI: ignore' - -def codeReview(data): - options = Options() - prompts = Prompts() - bot = Bot() - giteeApi = GiteeApi(data) - input = Input(data) - commenter = Commenter(giteeApi) - # 如果body中包含IGNORE_KEYWORD,跳过本次reivew - if (IGNORE_KEYWORD in input.description): - logger.info("skipped: body contains ignore_keyword") - return - - # 已经在pr中出现的comment - existingSummarizeComment = commenter.findCommentWithTag(Commenter.SUMMARIZE_TAG, giteeApi.prNumber) - existingCommitIdsBlock = '' - existingSummarizeCommentBody = '' - if existingSummarizeComment: - existingSummarizeCommentBody = existingSummarizeComment.get('body', None) - input.rawSummary = commenter.getRawSummary(existingSummarizeCommentBody) - input.shortSummary = commenter.getShortSummary(existingSummarizeCommentBody) - existingCommitIdsBlock = commenter.getReviewedCommitIdsBlock(existingSummarizeCommentBody) - - allCommitIds = commenter.getAllCommitIds() - highrestReviewedCommitId = '' - if existingCommitIdsBlock: - highrestReviewedCommitId = commenter.getHighestReviewedCommitId(allCommitIds, commenter.getReviewedCommitIds(existingCommitIdsBlock)) - - # 获取PR - prs = giteeApi.fetchPR() - # head:补丁分支,作者修改代码后提交PR对应的commit - # base:基准分支,接受修改的分支 - # PR:将补丁分支head中的代码合入基准分支base - headSha = None - baseSha = None - if prs.get('head', None): - headSha = prs.get('head', None).get('sha', None) - if prs.get('base', None): - baseSha = prs.get('base', None).get('sha', None) - - if (not highrestReviewedCommitId) or highrestReviewedCommitId == headSha: - logger.info('will review from the base commit: {}'.format(baseSha)) - highrestReviewedCommitId = baseSha - else: - logger.info('will review from commit: {}'.format(highrestReviewedCommitId)) - - # 比较highrestReviewedCommitId及headSha - incrementalDiff = giteeApi.compare(highrestReviewedCommitId, headSha) - targetBranchDiff = giteeApi.compare(baseSha, headSha) - - incrementalFiles = incrementalDiff.get('files', None) - targetBranchFiles = targetBranchDiff.get('files', None) - if (not incrementalFiles) and (not targetBranchFiles): - logger.warning('skipped: files data is missing') - return - - incrementalFilesNames = [] - for incrementalFile in incrementalFiles: - if incrementalFile.get('filename', None): - incrementalFilesNames.append(incrementalFile.get('filename')) - files = [] - for targetBranchFile in targetBranchFiles: - if targetBranchFile.get('filename', None) in incrementalFilesNames: - files.append(targetBranchFile) - if len(files) == 0: - logger.warning('skipped: files is null') - return - - filterSelectedFiles = [] - filterIgnoredFiles = [] - for aFile in files: - if options.checkPath(aFile.get('filename', None)): - filterSelectedFiles.append(aFile) - else: - logger.info('skip for excluded path: %s'%(aFile.get('filename', None))) - filterIgnoredFiles.append(aFile) - if len(filterSelectedFiles) == 0: - logger.warning('skipped: filterSelectedFiles is null') - return - - # 获取本次pr的所有commitid - commits = [] - if incrementalDiff.get('commits', None): - for commit in incrementalDiff.get('commits', None): - commits.append(commit.get('sha', None)) - - if not commits: - logger.warning('skipped: commits is null') - return - - filteredFiles = [] - # 把patch切割成hunk - for aFile in filterSelectedFiles: - if not giteeApi.prNumber: - logger('skipped: pr is null') - continue - - fileContent = '' - try: - # contentUrl = aFile.get('content_url', None) - # contents = json.loads(requests.get(contentUrl).content.decode('utf-8')) - # if contents and contents.get('type', None) == 'file' and contents.get('content', None): - # fileContent = base64.b64decode(contents.get('content', None)).decode('utf-8') - # content_url没有结果,换成raw_url - rawUrl = aFile.get('raw_url', None) - if rawUrl: - fileContent = giteeApi.fetchFileContent(rawUrl) - except Exception as e: - logger.warning('failed to get file contents: %s'%(e)) - - fileDiff = aFile.get('patch', '') - patches = [] - for patch in splitPatch(aFile.get('patch', '')): - patchLines = patchStartEndLine(patch) - if not patchLines: - continue - hunks = parsePatch(patch) - if not hunks: - continue - hunksStr = '''---new_hunk---\n\'\'\'\n%s\n\'\'\'\n---old_hunk---\n\'\'\'\n%s\n\'\'\''''%(hunks.get('newHunk', None), hunks.get('oldHunk', None)) - patches.append([patchLines.get('newHunk', None).get('startLine', None), patchLines.get('newHunk', None).get('endLine', None), hunksStr]) - if len(patches) > 0: - filteredFiles.append([aFile.get('filename', None), fileContent, fileDiff, patches]) - - filesAndChanges = filteredFiles - if len(filesAndChanges) == 0: - logger.error('skipped: no files to review') - return - - statusMsg = { - 'highrestReviewedCommitId': highrestReviewedCommitId, - 'headCommitId': allCommitIds[0], - 'filesAndChanges': filesAndChanges, - 'filterIgnoredFiles': filterIgnoredFiles, - } - - # 跳过summary,直接review - filesAndChangesReview = filesAndChanges - reviewsSkipped = [] - reviewsFailed = [] - skippedFiles = [] - reviewContent = [] - - def doReview(filename, fileContent, patches): - logger.info('reviewing: {}'.format(filename)) - ins = copy.deepcopy(input) - ins.filename = filename - ins.fileDiff = fileDiff - - tokens = getTokenCount(prompts.renderReviewFileDiff(ins)) - - # 计算有多少个hunkstr可以放入prompt - patchesToPack = 0 - for _, _, patch in patches: - patchTokens = getTokenCount(patch) - if tokens + patchTokens > options.TokenLimits: - logger.info('only packing {}/{} patches, tokens: {}/{}'.format(patchesToPack, len(patches), tokens, options.requestTokens)) - break - tokens += patchTokens - patchesToPack += 1 - - patchesPacked = 0 - for startLine, endLine, patch in patches: - if patchesPacked >= patchesToPack: - logger.info('unable to pack more patches into this request, packed: {}, total patches: {}, skipping'.format(patchesPacked, len(patches))) - if options.debug: - logger.info('prompt so far: {}'.format(prompts.renderReviewFileDiff(ins))) - break - patchesPacked += 1 - commentChain = '' - allChians = commenter.getCommentChainsWithinRange(giteeApi.prNumber, filename, startLine, endLine, commenter.COMMENT_REPLY_TAG) - if len(allChians) > 0: - logger.info('Found comment chains: {} for {}'.format(allChians, filename)) - commentChain = allChians - commentChainTokens = getTokenCount(commentChain) - if tokens + commentChainTokens > options.TokenLimits: - commentChain = '' - else: - tokens += commentChainTokens - - ins.patches += patch - if commentChain: - ins.patches += '---comment_chains---\n\'\'\'{}\'\'\'---end_change_section---'.format(commentChain) - if patchesPacked > 0: - print(prompts.renderReviewFileDiff(ins)) - exit() - res = bot.chat(prompts.renderReviewFileDiff(ins)) - - if res.status_code != 200: - logger.info('review: nothing obtained from openai') - reviewsFailed.append('{} (no response)'.format(filename)) - return - ans = parseReview(res) - print('-\n'*5) - print(ans) - reviewContent.append(ans) - - for filename, fileContent, _, patches in filesAndChangesReview: - doReview(filename, fileContent, patches) - - # 添加headSha到COMMIT_ID_TAG中 - summarizeComment = '' - summarizeComment += commenter.addReviewedCommitId(existingCommitIdsBlock, headSha) - - - giteeApi.submitReview(body = reviewContent, commitId = commits[-1]) - giteeApi.submitReview(body = summarizeComment, commitId = commits[-1]) - -def parseReview(response): - data_list = [] - pattern = r'"answer":\s+"([^"]+)"' - for line in response.iter_lines(): - if line: - line_text = line.decode("utf-8") - match = re.search(pattern, line_text) - if match: - result = match.group(1) - data_list.append(result) - - if len(data_list) < 2: - logger.info("no answer") - return - data_list.pop() - combined_result = "".join(data_list) - return combined_result - -def splitPatch(patch): - if not patch: - return [] - results = [] - splitLines = patch.split('\n') - # 去掉最后两行,一行是空格,一行是 \ No newline at end of file - splitLines = splitLines[:-2] - lastLine = -1 - for iLine in range(len(splitLines)): - # 当前行数据格式是否满足:@@ -0,0 +0,0 @@ - reSplit = re.split('^@@ -(\d+),(\d+) \+(\d+),(\d+) @@', splitLines[iLine]) - if len(reSplit) > 1: - if lastLine == -1: - lastLine = iLine - else: - results.append('\n'.join(splitLines[lastLine: iLine])) - lastLine = iLine - if lastLine != -1: - results.append('\n'.join(splitLines[lastLine:])) - return results - -def patchStartEndLine(patch): - reSplit = re.split('^@@ -(\d+),(\d+) \+(\d+),(\d+) @@', patch) - if len(reSplit) > 1: - oldBegin = int(reSplit[1]) - oldDiff = int(reSplit[2]) - newBegin = int(reSplit[3]) - newDiff = int(reSplit[4]) - return {'oldHunk': {'startLine': oldBegin, 'endLine': oldDiff}, 'newHunk': {'startLine': newBegin, 'endLine': newDiff}} - else: - return None - -def parsePatch(patch): - hunkInfo = patchStartEndLine(patch) - if not hunkInfo: - return - oldHunkLines = [] - newHunkLines = [] - newLine = hunkInfo.get('newHunk', None).get('startLine', None) - lines = patch.split('\n') [1:] # 去除第一行@@ - if lines[-1] == '': # 去除最后一行空格 - lines = lines[:-1] - skipStart = 3 - skipEnd = 3 - currentLine = 0 - - # reamovalOnly=True代表只删除内容,没有新增内容 - removalOnly = True - for line in lines: - if line.startswith('+'): - removalOnly = False - break - - for line in lines: - currentLine += 1 - if line.startswith('-'): - oldHunkLines.append(line[1:]) - elif line.startswith('+'): - newHunkLines.append(line[1:]) - newLine += 1 - else: - oldHunkLines.append(line) - if removalOnly or (currentLine > skipStart and currentLine <= len(lines) - skipEnd): - newHunkLines.append(str(newLine) + ': ' + line) - else: - newHunkLines.append(line) - newLine += 1 - return {"oldHunk": '\n'.join(oldHunkLines), "newHunk": '\n'.join(newHunkLines)} \ No newline at end of file diff --git a/prreivew/src/reviewCode/tokenizer.py b/prreivew/src/reviewCode/tokenizer.py deleted file mode 100644 index 3823f0626867b39ad0df8d140cc88454d000b59f..0000000000000000000000000000000000000000 --- a/prreivew/src/reviewCode/tokenizer.py +++ /dev/null @@ -1,9 +0,0 @@ -import tiktoken - - -def getTokenCount(strIn): - encoding = tiktoken.get_encoding('cl100k_base') - tokens = encoding.encode(strIn) - return len(tokens) - -