Files
TechHelper/TechHelper.Client/Exam/ExamParse.cs
SpecialX a21ca80782 1
2025-06-27 19:03:10 +08:00

504 lines
18 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using Entities.Contracts; // 假设这些实体合约仍然是必需的
using System.Text.RegularExpressions;
using System.Text;
namespace TechHelper.Client.Exam
{
/// <summary>
/// Categories of problems recorded while scanning exam text and building the exam structure.
/// </summary>
public enum ParseErrorType
{
/// <summary>A potential match failed a sanity check (inconsistent indices or missing match data).</summary>
Validation = 1,
/// <summary>A captured value, such as a score, could not be converted.</summary>
DataParsing = 2,
/// <summary>Content appeared where the structure does not allow it, e.g. an option with no owning question.</summary>
Structural = 3,
/// <summary>A regex match did not provide the expected capture groups.</summary>
RegexMatchIssue = 4,
/// <summary>An exception was caught while processing.</summary>
UnexpectedError = 5
}
/// <summary>
/// Immutable description of a single problem found while parsing an exam paper.
/// </summary>
public class ParseError
{
    /// <summary>Category of the problem.</summary>
    public ParseErrorType Type { get; }

    /// <summary>Human-readable description of the problem.</summary>
    public string Message { get; }

    /// <summary>Position of the offending item in the list being processed, when one was supplied.</summary>
    public int? Index { get; }

    /// <summary>Text associated with the problem, or <c>null</c> when none was supplied.</summary>
    public string? MatchedText { get; }

    /// <summary>Exception that caused the problem, or <c>null</c> when there was none.</summary>
    public Exception? InnerException { get; }

    /// <summary>
    /// Creates a parse error.
    /// </summary>
    /// <param name="type">Category of the problem.</param>
    /// <param name="message">Human-readable description.</param>
    /// <param name="index">Optional position of the offending item.</param>
    /// <param name="matchedText">Optional text associated with the problem.</param>
    /// <param name="innerException">Optional exception that caused the problem.</param>
    public ParseError(ParseErrorType type, string message, int? index = null, string? matchedText = null, Exception? innerException = null)
    {
        // The optional members default to null, so they are annotated as nullable
        // to keep the declared contract truthful for callers.
        Type = type;
        Message = message;
        Index = index;
        MatchedText = matchedText;
        InnerException = innerException;
    }

    /// <summary>
    /// Formats the error as "[Type] Message" followed by whichever optional details are present.
    /// </summary>
    /// <returns>A single-line textual representation of the error.</returns>
    public override string ToString()
    {
        var text = new StringBuilder();
        text.Append($"[{Type}] {Message}");

        if (Index.HasValue)
        {
            text.Append($" (Index: {Index.Value})");
        }

        if (!string.IsNullOrEmpty(MatchedText))
        {
            text.Append($" (MatchedText: '{MatchedText}')");
        }

        if (InnerException != null)
        {
            text.Append($" InnerException: {InnerException.Message}");
        }

        return text.ToString();
    }
}
/// <summary>
/// Parsed representation of a whole exam paper, together with the errors collected while parsing.
/// </summary>
public class AssignmentEx
{
    /// <summary>Exam title; initialised to the placeholder "Title".</summary>
    public string Title { get; set; } = "Title";

    /// <summary>Free-form description; initialised to the placeholder "Description".</summary>
    public string Description { get; set; } = "Description";

    /// <summary>Subject area of the exam; defaults to <c>SubjectAreaEnum.Unknown</c>.</summary>
    public SubjectAreaEnum SubjectArea { get; set; } = SubjectAreaEnum.Unknown;

    /// <summary>Root node of the question hierarchy.</summary>
    public AssignmentQuestionEx ExamStruct { get; set; } = new AssignmentQuestionEx();

    /// <summary>Problems recorded while parsing.</summary>
    public List<ParseError> Errors { get; set; } = new List<ParseError>();
}
/// <summary>
/// One node in the exam hierarchy: either a grouping node or a node that carries a question.
/// </summary>
public class AssignmentQuestionEx
{
    /// <summary>Heading text of the node.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Optional descriptive text for the node.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Ordinal of the node; starts at 0 (the type's default).</summary>
    public byte Index { get; set; }

    /// <summary>Score assigned to the node.</summary>
    public float Score { get; set; }

    /// <summary>Numbering path of the node within the hierarchy.</summary>
    public string Sequence { get; set; } = string.Empty;

    /// <summary>Question carried by this node, or <c>null</c> when the node has none.</summary>
    public QuestionEx? Question { get; set; }

    /// <summary>Structural kind of the node.</summary>
    public AssignmentStructType Type { get; set; }

    /// <summary>Child nodes nested under this node.</summary>
    public List<AssignmentQuestionEx> ChildrenAssignmentQuestion { get; set; } = new List<AssignmentQuestionEx>();

    /// <summary>Nesting priority of the node.</summary>
    public int Priority { get; set; }
}
/// <summary>
/// Content of a single question: its text, its answer and any answer options.
/// </summary>
public class QuestionEx
{
    /// <summary>Question text.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Answer text; empty by default.</summary>
    public string Answer { get; set; } = string.Empty;

    /// <summary>Answer options attached to the question.</summary>
    public List<Option> Options { get; set; } = new List<Option>();
}
/// <summary>
/// A single answer option of a question.
/// </summary>
public class Option
{
    /// <summary>Option marker; empty by default.</summary>
    public string Label { get; set; } = string.Empty;

    /// <summary>Option text; empty by default.</summary>
    public string Text { get; set; } = string.Empty;
}
/// <summary>
/// A regular expression paired with a priority and the structure type it identifies.
/// </summary>
public class RegexPatternConfig
{
    /// <summary>Source text of the pattern as passed to the constructor.</summary>
    public string Pattern { get; set; }

    /// <summary>Priority associated with the pattern.</summary>
    public int Priority { get; set; }

    /// <summary>Structure type produced when the pattern matches.</summary>
    public AssignmentStructType Type { get; set; }

    /// <summary>
    /// Regex built once in the constructor; it is not rebuilt when <see cref="Pattern"/> is reassigned.
    /// </summary>
    public Regex Regex { get; private set; }

    /// <summary>
    /// Builds the configuration and its regex (multiline, compiled).
    /// </summary>
    /// <param name="pattern">Regular expression source.</param>
    /// <param name="priority">Priority associated with the pattern.</param>
    /// <param name="type">Structure type; defaults to <c>AssignmentStructType.Question</c>.</param>
    public RegexPatternConfig(string pattern, int priority, AssignmentStructType type = AssignmentStructType.Question)
    {
        const RegexOptions options = RegexOptions.Multiline | RegexOptions.Compiled;

        Regex = new Regex(pattern, options);
        Pattern = pattern;
        Priority = priority;
        Type = type;
    }
}
/// <summary>
/// Configuration for exam parsing: holds every regular expression the parser uses.
/// </summary>
public class ExamParserConfig
{
    /// <summary>Patterns that recognise group headings and questions.</summary>
    public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise answer options.</summary>
    public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Stand-alone regex that recognises a trailing score such as "(10分)".</summary>
    public Regex ScoreRegex { get; private set; }

    /// <summary>
    /// Registers the default heading/question patterns, option patterns and the score regex.
    /// </summary>
    public ExamParserConfig()
    {
        // Heading/question patterns are anchored at the start of a line and are
        // told apart by priority. Every one of them must expose:
        //   Group 1: the numbering part
        //   Group 2: the heading / question text
        // because the structure builder rejects matches with fewer than three groups.

        // e.g. "一. 这是大题一"
        QuestionPatterns.Add(new RegexPatternConfig(@"^([一二三四五六七八九十]+)[.\、]\s*(.+)", 1, AssignmentStructType.Struct));

        // e.g. "(一) 这是第一子题组"
        QuestionPatterns.Add(new RegexPatternConfig(@"^\(([一二三四五六七八九十]{1,2}|十[一二三四五六七八九])\)\s*(.+)", 2, AssignmentStructType.Composite));

        // e.g. "1. 这是第一道题目" or "1 这是第一道题目"
        QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.?\s*(.+)", 3, AssignmentStructType.Question));

        // e.g. "(1). 这是小问一" or "(1) 这是小问一"
        QuestionPatterns.Add(new RegexPatternConfig(@"^\((\d+)\)\.?\s*(.+)", 4, AssignmentStructType.Question));

        // e.g. "① 这是另一种小问" or "①. 这是另一种小问".
        // The circled number is captured as Group 1 so this pattern has the same
        // (number, text) group layout as the others; without that group the text
        // would land in Group 1 and the match would be discarded by the builder.
        QuestionPatterns.Add(new RegexPatternConfig(@"^([①②③④⑤⑥⑦⑧⑨⑩]+)\.?\s*(.+)", 5, AssignmentStructType.Question));

        // Option patterns, distinguished by AssignmentStructType.Option.
        // Group 1: the label including its dot; Group 2: the option text, which
        // runs up to the next label or the end of the line.
        OptionPatterns.Add(new RegexPatternConfig(@"([A-Z]\.)\s*(.*?)(?=[A-Z]\.|$)", 1, AssignmentStructType.Option));
        OptionPatterns.Add(new RegexPatternConfig(@"([a-z]\.)\s*(.*?)(?=[a-z]\.|$)", 2, AssignmentStructType.Option));

        // Score regex: matches "(X分)" at the end of a line.
        // Group 1 captures the number (e.g. "10" or "0.5").
        ScoreRegex = new Regex(@"(?:\s*\(((\d+(?:\.\d+)?))\s*分\)\s*$)", RegexOptions.Multiline | RegexOptions.Compiled);
    }
}
/// <summary>
/// A regex hit found in the exam text, kept together with the pattern that produced it.
/// </summary>
public class PotentialMatch
{
    /// <summary>Offset in the exam text where the hit starts.</summary>
    public int StartIndex { get; set; }

    /// <summary>Offset in the exam text where the hit ends.</summary>
    public int EndIndex { get; set; }

    /// <summary>Text of the hit.</summary>
    public string MatchedText { get; set; }

    /// <summary>Underlying regex match, giving access to the capture groups.</summary>
    public Match RegexMatch { get; set; }

    /// <summary>Pattern configuration that produced the hit.</summary>
    public RegexPatternConfig PatternConfig { get; set; }
}
/// <summary>
/// Runs every configured pattern over the exam text and collects the hits in document order.
/// </summary>
public class ExamDocumentScanner
{
    private readonly ExamParserConfig _config;

    /// <summary>
    /// Creates a scanner bound to the given configuration.
    /// </summary>
    /// <param name="config">Pattern configuration; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public ExamDocumentScanner(ExamParserConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        _config = config;
    }

    /// <summary>
    /// Applies the question patterns and then the option patterns to <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Exam text to scan; null or empty yields an empty result.</param>
    /// <param name="errors">Receives an entry for each pattern whose matching threw.</param>
    /// <returns>All hits ordered by start offset; hits with equal offsets keep pattern order.</returns>
    public List<PotentialMatch> Scan(string text, List<ParseError> errors)
    {
        var hits = new List<PotentialMatch>();
        if (string.IsNullOrEmpty(text))
        {
            return hits;
        }

        // Question patterns first, option patterns second.
        var patterns = new List<RegexPatternConfig>(_config.QuestionPatterns);
        patterns.AddRange(_config.OptionPatterns);

        foreach (RegexPatternConfig pattern in patterns)
        {
            try
            {
                foreach (Match hit in pattern.Regex.Matches(text))
                {
                    hits.Add(ToPotentialMatch(hit, pattern));
                }
            }
            catch (Exception ex)
            {
                // A failing pattern is reported and skipped; the others still run.
                string message = $"An error occurred during regex matching for pattern: '{pattern.Pattern}'.";
                errors.Add(new ParseError(ParseErrorType.UnexpectedError, message, innerException: ex));
            }
        }

        // The ordering clause is a stable sort, so ties keep their insertion order.
        return (from hit in hits
                orderby hit.StartIndex
                select hit).ToList();
    }

    // Wraps a regex match and its originating pattern into a PotentialMatch.
    private static PotentialMatch ToPotentialMatch(Match hit, RegexPatternConfig pattern)
    {
        var result = new PotentialMatch();
        result.StartIndex = hit.Index;
        result.EndIndex = hit.Index + hit.Length;
        result.MatchedText = hit.Value;
        result.RegexMatch = hit;
        result.PatternConfig = pattern;
        return result;
    }
}
public class ExamStructureBuilder
{
private readonly ExamParserConfig _config;
/// <summary>
/// Creates a builder bound to the given configuration.
/// </summary>
/// <param name="config">Parser configuration; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
public ExamStructureBuilder(ExamParserConfig config)
{
    if (config == null)
    {
        throw new ArgumentNullException(nameof(config), "ExamParserConfig cannot be null.");
    }

    _config = config;
}
/// <summary>
/// Builds the exam hierarchy from the exam text and the ordered list of potential matches.
/// </summary>
/// <remarks>
/// Text before the first match, between matches and after the last match is appended to the
/// question currently on top of the stack or, when that node carries no question, to the
/// exam description. Problems are recorded in the result's error list rather than thrown.
/// </remarks>
/// <param name="fullExamText">Complete exam text.</param>
/// <param name="allPotentialMatches">Matches produced by the scanner, ordered by start offset.</param>
/// <returns>The assembled exam together with any recorded errors.</returns>
/// <exception cref="ArgumentException"><paramref name="fullExamText"/> is null, empty or whitespace.</exception>
/// <exception cref="ArgumentNullException"><paramref name="allPotentialMatches"/> is null.</exception>
public AssignmentEx BuildExam(string fullExamText, List<PotentialMatch> allPotentialMatches)
{
    if (string.IsNullOrWhiteSpace(fullExamText))
    {
        throw new ArgumentException("Full exam text cannot be null or empty.", nameof(fullExamText));
    }
    if (allPotentialMatches == null)
    {
        throw new ArgumentNullException(nameof(allPotentialMatches), "Potential matches list cannot be null.");
    }

    var exam = new AssignmentEx();
    try
    {
        exam.Title = GetExamTitle(fullExamText);
    }
    catch (Exception ex)
    {
        exam.Errors.Add(new ParseError(ParseErrorType.UnexpectedError, "Failed to extract exam title.", innerException: ex));
        exam.Title = "未识别试卷标题";
    }

    // The root node sits at the bottom of the stack for the whole build.
    var root = new AssignmentQuestionEx { Type = AssignmentStructType.Struct, Priority = 0, Title = "Root Exam Structure" };
    var openNodes = new Stack<AssignmentQuestionEx>();
    openNodes.Push(root);
    exam.ExamStruct = root;

    // Appends text to the description, newline-separated unless the description is blank.
    void AppendToDescription(string text)
    {
        string separator = string.IsNullOrWhiteSpace(exam.Description) ? "" : "\n";
        exam.Description = exam.Description + separator + text;
    }

    // Sends non-blank free text to the current question, or to the description
    // when the node on top of the stack carries no question.
    void RouteFreeText(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        AssignmentQuestionEx owner = openNodes.Peek();
        if (owner.Question != null)
        {
            ProcessQuestionContent(owner, text, exam.Errors);
        }
        else
        {
            AppendToDescription(text);
        }
    }

    // Offset of the first character that has not been consumed yet.
    int cursor = 0;
    if (allPotentialMatches.Count > 0)
    {
        int firstStart = allPotentialMatches[0].StartIndex;
        if (firstStart > 0)
        {
            // Everything ahead of the first match is introductory text.
            string intro = fullExamText.Substring(0, firstStart).Trim();
            if (!string.IsNullOrWhiteSpace(intro))
            {
                AppendToDescription(intro);
            }
        }
        cursor = firstStart;
    }

    for (int i = 0; i < allPotentialMatches.Count; i++)
    {
        PotentialMatch candidate = allPotentialMatches[i];
        try
        {
            if (IsValidPotentialMatch(candidate, i, fullExamText.Length, cursor, exam.Errors))
            {
                // Text between the previous match and this one.
                RouteFreeText(fullExamText.Substring(cursor, candidate.StartIndex - cursor).Trim());

                if (candidate.PatternConfig.Type == AssignmentStructType.Option)
                {
                    HandleOptionMatch(candidate, i, openNodes.Peek(), exam.Errors);
                }
                else
                {
                    HandleQuestionGroupMatch(candidate, i, openNodes, exam.Errors);
                }

                cursor = candidate.EndIndex;
            }
            else
            {
                // Rejected match: never move the cursor backwards.
                cursor = Math.Max(cursor, candidate.EndIndex);
            }
        }
        catch (Exception ex)
        {
            exam.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                $"An unexpected error occurred during main loop processing of PotentialMatch at index {i}.",
                index: i, matchedText: candidate.MatchedText, innerException: ex));
            cursor = Math.Max(cursor, candidate.EndIndex);
        }
    }

    if (cursor < fullExamText.Length)
    {
        try
        {
            // Whatever follows the last match.
            RouteFreeText(fullExamText.Substring(cursor).Trim());
        }
        catch (Exception ex)
        {
            exam.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                "An unexpected error occurred while processing remaining text after all potential matches.",
                innerException: ex));
        }
    }

    return exam;
}
/// <summary>
/// Checks that a potential match has consistent offsets and carries its regex data.
/// </summary>
/// <param name="pm">Match to check.</param>
/// <param name="index">Position of the match in the list, used for error reporting.</param>
/// <param name="fullTextLength">Length of the exam text.</param>
/// <param name="currentContentStart">Offset up to which the text has already been consumed.</param>
/// <param name="errors">Receives a validation error when the check fails.</param>
/// <returns><c>true</c> when the match can be processed; otherwise <c>false</c>.</returns>
private bool IsValidPotentialMatch(PotentialMatch pm, int index, int fullTextLength, int currentContentStart, List<ParseError> errors)
{
    bool startsInConsumedText = pm.StartIndex < currentContentStart;
    bool endsPastText = pm.EndIndex > fullTextLength;
    bool endsBeforeStart = pm.StartIndex > pm.EndIndex;

    string problem;
    if (startsInConsumedText || endsPastText || endsBeforeStart)
    {
        problem = $"PotentialMatch at index {index} has invalid start/end indices. Start: {pm.StartIndex}, End: {pm.EndIndex}, CurrentContentStart: {currentContentStart}, FullTextLength: {fullTextLength}";
    }
    else if (pm.RegexMatch == null || pm.PatternConfig == null)
    {
        problem = $"PotentialMatch at index {index} is missing RegexMatch or PatternConfig.";
    }
    else
    {
        return true;
    }

    errors.Add(new ParseError(ParseErrorType.Validation, problem, index: index, matchedText: pm.MatchedText));
    return false;
}
/// <summary>
/// Turns a heading/question match into a node, attaches it under the correct parent and
/// pushes it onto the stack as the new current node.
/// </summary>
/// <remarks>
/// Nodes whose priority is greater than or equal to the match's priority are popped first
/// (the bottom node is never popped), so the new node becomes a child of the nearest node
/// with a smaller priority. A trailing "(X分)" score is parsed with the invariant culture
/// and removed from the title. Failures are recorded in <paramref name="errors"/>, not thrown.
/// </remarks>
/// <param name="pm">Match to process; its MatchedText has the score suffix trimmed when one is found.</param>
/// <param name="index">Position of the match in the list, used for error reporting.</param>
/// <param name="assignmentQuestionStack">Stack of currently open nodes.</param>
/// <param name="errors">Receives any problems found.</param>
private void HandleQuestionGroupMatch(PotentialMatch pm, int index, Stack<AssignmentQuestionEx> assignmentQuestionStack, List<ParseError> errors)
{
    try
    {
        // Close every open node at the same or a deeper level than the new one.
        while (assignmentQuestionStack.Count > 1 && pm.PatternConfig.Priority <= assignmentQuestionStack.Peek().Priority)
        {
            assignmentQuestionStack.Pop();
        }

        string parentSequence = assignmentQuestionStack.Count > 0 ? assignmentQuestionStack.Peek().Sequence : string.Empty;

        // Group 1 is the numbering, Group 2 is the heading/question text.
        GroupCollection groups = pm.RegexMatch.Groups;
        if (groups.Count < 3 || !groups[1].Success || string.IsNullOrWhiteSpace(groups[2].Value))
        {
            errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                $"Question/Group match at index {index} does not have enough regex groups (expected 3 for number and title) or a valid title group (Group 2). Skipping this group.",
                index: index, matchedText: pm.MatchedText));
            return;
        }

        float score = 0;

        // Look for a score at the end of the matched text.
        Match scoreMatch = _config.ScoreRegex.Match(pm.MatchedText);
        if (scoreMatch.Success && scoreMatch.Groups.Count > 1 && scoreMatch.Groups[1].Success)
        {
            // The score regex only captures digits with an optional '.', so it must be
            // parsed culture-independently; the current culture may treat '.' as a
            // group separator and silently turn "0.5" into 5.
            bool parsed = float.TryParse(
                scoreMatch.Groups[1].Value,
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out score);
            if (!parsed)
            {
                errors.Add(new ParseError(ParseErrorType.DataParsing,
                    $"Failed to parse score '{scoreMatch.Groups[1].Value}' for match at index {index}. Defaulting to 0.",
                    index: index, matchedText: pm.MatchedText));
            }

            // Drop the score from MatchedText. This changes only the match object,
            // not the exam text.
            pm.MatchedText = pm.MatchedText.Substring(0, scoreMatch.Index).Trim();
        }

        // Group 2 runs to the end of the line and therefore still contains the
        // "(X分)" suffix; strip it so the title holds only the heading text.
        string title = groups[2].Value;
        Match titleScoreMatch = _config.ScoreRegex.Match(title);
        if (titleScoreMatch.Success)
        {
            title = title.Substring(0, titleScoreMatch.Index);
        }
        title = title.Trim();

        string ownNumber = groups[1].Value.Trim();
        // NOTE(review): the joiner is " ." (space followed by a dot); confirm this is intended.
        string sequenceSuffix = string.IsNullOrEmpty(ownNumber) || string.IsNullOrEmpty(parentSequence)
            ? ownNumber
            : " ." + ownNumber;

        var node = new AssignmentQuestionEx
        {
            Priority = pm.PatternConfig.Priority,
            Type = pm.PatternConfig.Type,
            Sequence = parentSequence + sequenceSuffix,
            Score = score
        };

        if (pm.PatternConfig.Type == AssignmentStructType.Struct)
        {
            // Grouping heading: the text is the node's own title.
            node.Title = title;
        }
        else
        {
            // Every other type carries a question whose text is the title.
            node.Question = new QuestionEx { Title = title };
        }

        assignmentQuestionStack.Peek().ChildrenAssignmentQuestion.Add(node);
        assignmentQuestionStack.Push(node);
    }
    catch (Exception ex)
    {
        errors.Add(new ParseError(ParseErrorType.UnexpectedError,
            $"An unexpected error occurred during processing a non-option match (type: {pm.PatternConfig.Type}) at index {index}.",
            index: index, matchedText: pm.MatchedText, innerException: ex));
    }
}
/// <summary>
/// Adds the option described by a match to the question of the current node.
/// </summary>
/// <param name="pm">Option match; Group 1 is the label and Group 2 the option text.</param>
/// <param name="index">Position of the match in the list, used for error reporting.</param>
/// <param name="currentAssignmentQuestion">Node that should own the option.</param>
/// <param name="errors">Receives any problems found; nothing is thrown.</param>
private void HandleOptionMatch(PotentialMatch pm, int index, AssignmentQuestionEx currentAssignmentQuestion, List<ParseError> errors)
{
    try
    {
        var owner = currentAssignmentQuestion.Question;
        if (owner == null)
        {
            // An option needs a question to attach to.
            errors.Add(new ParseError(ParseErrorType.Structural,
                $"Found isolated Option at index {index}. Options must belong to a 'Question' type structure. Ignoring this option.",
                index: index, matchedText: pm.MatchedText));
            return;
        }

        GroupCollection groups = pm.RegexMatch.Groups;
        bool hasLabelAndText = groups.Count >= 3
            && groups[1].Success
            && !string.IsNullOrWhiteSpace(groups[2].Value);
        if (!hasLabelAndText)
        {
            errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                $"Option match at index {index} does not have enough regex groups or valid label/text groups (Group 1/2). Skipping this option.",
                index: index, matchedText: pm.MatchedText));
            return;
        }

        var option = new Option();
        option.Label = groups[1].Value.Trim();
        option.Text = groups[2].Value.Trim();
        owner.Options.Add(option);
    }
    catch (Exception ex)
    {
        errors.Add(new ParseError(ParseErrorType.UnexpectedError,
            $"An unexpected error occurred during processing Option at index {index}.",
            index: index, matchedText: pm.MatchedText, innerException: ex));
    }
}
/// <summary>
/// Appends free text to the title of a node's question, newline-separated when the
/// title already has content.
/// </summary>
/// <param name="question">Node whose question receives the text.</param>
/// <param name="contentText">Text to append; blank text is ignored.</param>
/// <param name="errors">Receives a structural error when the node carries no question.</param>
private void ProcessQuestionContent(AssignmentQuestionEx question, string contentText, List<ParseError> errors)
{
    var target = question?.Question;
    if (target == null)
    {
        errors.Add(new ParseError(ParseErrorType.Structural,
            $"Attempted to process content for a non-question type AssignmentQuestionEx (Type: {question?.Type}). Content: '{contentText}'",
            matchedText: contentText));
        return;
    }

    if (string.IsNullOrWhiteSpace(contentText))
    {
        return;
    }

    string separator = string.IsNullOrWhiteSpace(target.Title) ? "" : "\n";
    target.Title = target.Title + separator + contentText;
}
/// <summary>
/// Returns the first line of the exam text that is not blank, exactly as written.
/// </summary>
/// <param name="examPaperText">Complete exam text.</param>
/// <returns>The first non-blank line, or a fallback title when every line is blank.</returns>
private string GetExamTitle(string examPaperText)
{
    char[] lineBreaks = { '\r', '\n' };
    string[] lines = examPaperText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    foreach (string line in lines)
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            return line;
        }
    }

    return "未识别试卷标题";
}
}
/// <summary>
/// Entry point for exam parsing: scans the text for potential matches and builds the
/// structured exam from them.
/// </summary>
public class ExamParser
{
    private readonly ExamParserConfig _config;
    private readonly ExamDocumentScanner _scanner;
    private readonly ExamStructureBuilder _builder;

    /// <summary>
    /// Creates a parser whose scanner and builder share the given configuration.
    /// </summary>
    /// <param name="config">Parser configuration; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public ExamParser(ExamParserConfig config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
        _scanner = new ExamDocumentScanner(_config);
        _builder = new ExamStructureBuilder(_config);
    }

    /// <summary>
    /// Parses the given exam text and returns the structured <see cref="AssignmentEx"/>.
    /// </summary>
    /// <param name="examPaperText">Complete exam text.</param>
    /// <returns>
    /// The parsed exam. Its error list holds the scanner's errors first, followed by the
    /// builder's errors.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// Thrown by the builder when <paramref name="examPaperText"/> is null, empty or whitespace.
    /// </exception>
    public AssignmentEx ParseExamPaper(string examPaperText)
    {
        // The builder returns a fresh AssignmentEx, so scanner errors are collected
        // separately and merged into the result; otherwise they would be discarded.
        var scanErrors = new List<ParseError>();
        List<PotentialMatch> allPotentialMatches = _scanner.Scan(examPaperText, scanErrors);

        AssignmentEx assignment = _builder.BuildExam(examPaperText, allPotentialMatches);
        if (scanErrors.Count > 0)
        {
            // Scanning happens first, so its errors go ahead of the builder's.
            assignment.Errors.InsertRange(0, scanErrors);
        }

        return assignment;
    }
}
}