// Source listing: 504 lines, 18 KiB, C#
using Entities.Contracts; // 假设这些实体合约仍然是必需的
|
||
using System.Text.RegularExpressions;
|
||
using System.Text;
|
||
|
||
namespace TechHelper.Client.Exam
|
||
{
|
||
/// <summary>
/// Categories of problems that can be recorded while an exam paper is parsed.
/// </summary>
public enum ParseErrorType
{
    /// <summary>A potential match carried inconsistent indices or was missing required data.</summary>
    Validation = 1,

    /// <summary>A captured value (for example a score) could not be converted.</summary>
    DataParsing = 2,

    /// <summary>An element appeared where the exam structure does not allow it.</summary>
    Structural = 3,

    /// <summary>A regex match did not expose the capture groups the builder expects.</summary>
    RegexMatchIssue = 4,

    /// <summary>An exception was caught while processing.</summary>
    UnexpectedError = 5
}
|
||
|
||
/// <summary>
/// Describes a single problem recorded while scanning or building an exam structure.
/// All properties are get-only and assigned once by the constructor.
/// </summary>
public class ParseError
{
    /// <summary>Category of the problem.</summary>
    public ParseErrorType Type { get; }

    /// <summary>Human-readable description of the problem.</summary>
    public string Message { get; }

    /// <summary>Index supplied by the reporter, or <c>null</c> when none was given.</summary>
    public int? Index { get; }

    /// <summary>Text associated with the problem, or <c>null</c> when none was given.</summary>
    public string? MatchedText { get; }

    /// <summary>Exception that triggered the error, or <c>null</c> when none was given.</summary>
    public Exception? InnerException { get; }

    /// <summary>
    /// Creates a new error record.
    /// </summary>
    /// <param name="type">Category of the problem.</param>
    /// <param name="message">Human-readable description.</param>
    /// <param name="index">Optional index associated with the problem.</param>
    /// <param name="matchedText">Optional text associated with the problem.</param>
    /// <param name="innerException">Optional underlying exception.</param>
    public ParseError(ParseErrorType type, string message, int? index = null, string? matchedText = null, Exception? innerException = null)
    {
        Type = type;
        Message = message;
        Index = index;
        MatchedText = matchedText;
        InnerException = innerException;
    }

    /// <summary>
    /// Formats the error as "[Type] Message", followed by the index, matched text and
    /// inner-exception message when each of them is present.
    /// </summary>
    /// <returns>A single-line textual representation of the error.</returns>
    public override string ToString()
    {
        var sb = new StringBuilder();
        sb.Append($"[{Type}] {Message}");

        if (Index.HasValue)
        {
            sb.Append($" (Index: {Index.Value})");
        }

        if (!string.IsNullOrEmpty(MatchedText))
        {
            sb.Append($" (MatchedText: '{MatchedText}')");
        }

        if (InnerException != null)
        {
            // Only the message is included; the full exception stays available on the property.
            sb.Append($" InnerException: {InnerException.Message}");
        }

        return sb.ToString();
    }
}
|
||
|
||
/// <summary>
/// Result of parsing an exam paper: descriptive fields, the root of the question tree,
/// and the errors collected along the way.
/// </summary>
public class AssignmentEx
{
    /// <summary>Exam title; starts as the placeholder "Title".</summary>
    public string Title { get; set; } = "Title";

    /// <summary>Free-form description; starts as the placeholder "Description".</summary>
    public string Description { get; set; } = "Description";

    /// <summary>Subject area of the exam; <c>Unknown</c> until set.</summary>
    public SubjectAreaEnum SubjectArea { get; set; } = SubjectAreaEnum.Unknown;

    /// <summary>Root node of the parsed question hierarchy.</summary>
    public AssignmentQuestionEx ExamStruct { get; set; } = new();

    /// <summary>Errors recorded while the exam was parsed.</summary>
    public List<ParseError> Errors { get; set; } = new();
}
|
||
|
||
/// <summary>
/// One node of the exam hierarchy. A node either groups child nodes or carries a
/// <see cref="QuestionEx"/> in <see cref="Question"/>.
/// </summary>
public class AssignmentQuestionEx
{
    /// <summary>Node title; empty by default.</summary>
    public string Title { get; set; } = "";

    /// <summary>Node description; empty by default.</summary>
    public string Description { get; set; } = "";

    /// <summary>Numeric index of the node; zero by default.</summary>
    public byte Index { get; set; }

    /// <summary>Score assigned to the node.</summary>
    public float Score { get; set; }

    /// <summary>Numbering path of the node; empty by default.</summary>
    public string Sequence { get; set; } = "";

    /// <summary>Question payload, or <c>null</c> when the node has none.</summary>
    public QuestionEx? Question { get; set; }

    /// <summary>Structural kind of the node.</summary>
    public AssignmentStructType Type { get; set; }

    /// <summary>Child nodes nested under this node.</summary>
    public List<AssignmentQuestionEx> ChildrenAssignmentQuestion { get; set; } = new();

    /// <summary>Priority of the pattern that produced the node.</summary>
    public int Priority { get; set; }
}
|
||
|
||
/// <summary>
/// Content of a single question: its text, its answer and any selectable options.
/// </summary>
public class QuestionEx
{
    /// <summary>Question text; empty by default.</summary>
    public string Title { get; set; } = "";

    /// <summary>Answer text; empty by default.</summary>
    public string Answer { get; set; } = "";

    /// <summary>Options attached to the question; empty by default.</summary>
    public List<Option> Options { get; set; } = new();
}
|
||
|
||
/// <summary>
/// A single selectable option of a question.
/// </summary>
public class Option
{
    /// <summary>Option label (for example "A."); empty by default.</summary>
    public string Label { get; set; } = "";

    /// <summary>Option text; empty by default.</summary>
    public string Text { get; set; } = "";
}
|
||
|
||
|
||
/// <summary>
/// A regular-expression pattern paired with a priority and the structure type it identifies.
/// </summary>
public class RegexPatternConfig
{
    // Options every pattern is compiled with.
    private const RegexOptions PatternOptions = RegexOptions.Multiline | RegexOptions.Compiled;

    private string _pattern;

    /// <summary>
    /// Source text of the pattern. Assigning a new value recompiles <see cref="Regex"/>
    /// so the two can never disagree.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The assigned value is not a valid regular expression.</exception>
    public string Pattern
    {
        get => _pattern;
        set
        {
            // Compile first: if the new pattern is invalid, the previous pattern and regex stay intact.
            Regex = new Regex(value, PatternOptions);
            _pattern = value;
        }
    }

    /// <summary>Priority of the pattern.</summary>
    public int Priority { get; set; }

    /// <summary>Structure type assigned to matches of this pattern.</summary>
    public AssignmentStructType Type { get; set; }

    /// <summary>Compiled regex for <see cref="Pattern"/> (multiline, compiled).</summary>
    public Regex Regex { get; private set; }

    /// <summary>
    /// Creates a pattern configuration and compiles the regex.
    /// </summary>
    /// <param name="pattern">Regular-expression source text.</param>
    /// <param name="priority">Priority of the pattern.</param>
    /// <param name="type">Structure type assigned to matches; defaults to a question.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException"><paramref name="pattern"/> is not a valid regular expression.</exception>
    public RegexPatternConfig(string pattern, int priority, AssignmentStructType type = AssignmentStructType.Question)
    {
        Regex = new Regex(pattern, PatternOptions);
        _pattern = pattern;
        Priority = priority;
        Type = type;
    }
}
|
||
|
||
/// <summary>
/// Configuration for exam parsing; holds every regular expression the parser uses.
/// </summary>
public class ExamParserConfig
{
    /// <summary>Patterns that recognise question and question-group headings.</summary>
    public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise answer options.</summary>
    public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Stand-alone regex that recognises a trailing score such as "(10分)".</summary>
    public Regex ScoreRegex { get; private set; }

    /// <summary>
    /// Populates the default heading, option and score patterns.
    /// </summary>
    public ExamParserConfig()
    {
        // Heading patterns: anchored at the start of a line and told apart by priority.
        // Every heading pattern must expose exactly two capture groups, because the
        // structure builder rejects matches with fewer:
        //   Group 1: the numbering part
        //   Group 2: the heading / question text

        // e.g. "一. major section one"
        QuestionPatterns.Add(new RegexPatternConfig(@"^([一二三四五六七八九十]+)[.\、]\s*(.+)", 1, AssignmentStructType.Struct));

        // e.g. "(一) first sub-group"
        QuestionPatterns.Add(new RegexPatternConfig(@"^\(([一二三四五六七八九十]{1,2}|十[一二三四五六七八九])\)\s*(.+)", 2, AssignmentStructType.Composite));

        // e.g. "1. first question" or "1 first question"
        QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.?\s*(.+)", 3, AssignmentStructType.Question));

        // e.g. "(1). sub-question one" or "(1) sub-question one"
        QuestionPatterns.Add(new RegexPatternConfig(@"^\((\d+)\)\.?\s*(.+)", 4, AssignmentStructType.Question));

        // e.g. "① another sub-question" or "①. another sub-question".
        // The circled number is captured as Group 1; without that group the match
        // would expose only one capture group and be rejected by the builder.
        QuestionPatterns.Add(new RegexPatternConfig(@"^([①②③④⑤⑥⑦⑧⑨⑩]+)\.?\s*(.+)", 5, AssignmentStructType.Question));

        // Option patterns, distinguished by AssignmentStructType.Option.
        //   Group 1: the label including its dot (e.g. "A.")
        //   Group 2: the option text, up to the next label or the end of the line
        OptionPatterns.Add(new RegexPatternConfig(@"([A-Z]\.)\s*(.*?)(?=[A-Z]\.|$)", 1, AssignmentStructType.Option));
        OptionPatterns.Add(new RegexPatternConfig(@"([a-z]\.)\s*(.*?)(?=[a-z]\.|$)", 2, AssignmentStructType.Option));

        // Stand-alone score regex: matches "(X分)" at the end of a line.
        //   Group 1: the score (e.g. "10" or "0.5")
        ScoreRegex = new Regex(@"(?:\s*\(((\d+(?:\.\d+)?))\s*分\)\s*$)", RegexOptions.Multiline | RegexOptions.Compiled);
    }
}
|
||
|
||
/// <summary>
/// A single regex hit found in the exam text, together with the pattern that produced it.
/// </summary>
public class PotentialMatch
{
    /// <summary>Position in the exam text where the hit begins.</summary>
    public int StartIndex { get; set; }

    /// <summary>Position in the exam text just past the end of the hit.</summary>
    public int EndIndex { get; set; }

    /// <summary>Text covered by the hit.</summary>
    public string MatchedText { get; set; }

    /// <summary>Underlying regex match, used to read capture groups.</summary>
    public Match RegexMatch { get; set; }

    /// <summary>Pattern configuration that produced the hit.</summary>
    public RegexPatternConfig PatternConfig { get; set; }
}
|
||
|
||
/// <summary>
/// Runs every configured heading and option pattern over an exam text and collects the hits.
/// </summary>
public class ExamDocumentScanner
{
    private readonly ExamParserConfig _config;

    /// <summary>
    /// Creates a scanner bound to the given configuration.
    /// </summary>
    /// <param name="config">Patterns to scan with.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is <c>null</c>.</exception>
    public ExamDocumentScanner(ExamParserConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        _config = config;
    }

    /// <summary>
    /// Finds all matches of the question patterns followed by the option patterns.
    /// </summary>
    /// <param name="text">Exam text to scan; a null or empty text yields an empty list.</param>
    /// <param name="errors">Receives an entry for every pattern whose matching threw.</param>
    /// <returns>All hits ordered by start index; hits with equal start keep pattern order.</returns>
    public List<PotentialMatch> Scan(string text, List<ParseError> errors)
    {
        var hits = new List<PotentialMatch>();
        if (string.IsNullOrEmpty(text))
        {
            return hits;
        }

        // Snapshot of the patterns: question patterns first, then option patterns.
        List<RegexPatternConfig> patterns = _config.QuestionPatterns
            .Concat(_config.OptionPatterns)
            .ToList();

        foreach (RegexPatternConfig patternConfig in patterns)
        {
            try
            {
                foreach (Match hit in patternConfig.Regex.Matches(text))
                {
                    int start = hit.Index;
                    var candidate = new PotentialMatch
                    {
                        PatternConfig = patternConfig,
                        RegexMatch = hit,
                        MatchedText = hit.Value,
                        StartIndex = start,
                        EndIndex = start + hit.Length,
                    };
                    hits.Add(candidate);
                }
            }
            catch (Exception ex)
            {
                // A failing pattern is reported and skipped so the remaining patterns still run.
                var failure = new ParseError(
                    ParseErrorType.UnexpectedError,
                    $"An error occurred during regex matching for pattern: '{patternConfig.Pattern}'.",
                    innerException: ex);
                errors.Add(failure);
            }
        }

        // The ordering is stable, so ties keep the order in which the patterns were applied.
        return (from hit in hits
                orderby hit.StartIndex
                select hit).ToList();
    }
}
|
||
|
||
/// <summary>
/// Turns an ordered list of <see cref="PotentialMatch"/> hits into a nested
/// <see cref="AssignmentEx"/> structure, recording problems instead of aborting.
/// </summary>
public class ExamStructureBuilder
{
    private readonly ExamParserConfig _config;

    /// <summary>
    /// Creates a builder bound to the given configuration.
    /// </summary>
    /// <param name="config">Configuration providing the score regex.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is <c>null</c>.</exception>
    public ExamStructureBuilder(ExamParserConfig config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config), "ExamParserConfig cannot be null.");
    }

    /// <summary>
    /// Builds the exam structure from the full text and the hits found in it.
    /// </summary>
    /// <param name="fullExamText">Complete exam text the hits refer to.</param>
    /// <param name="allPotentialMatches">Hits ordered by start index.</param>
    /// <returns>The assignment with its question tree and any recorded errors.</returns>
    /// <exception cref="ArgumentException"><paramref name="fullExamText"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="allPotentialMatches"/> is <c>null</c>.</exception>
    public AssignmentEx BuildExam(string fullExamText, List<PotentialMatch> allPotentialMatches)
    {
        if (string.IsNullOrWhiteSpace(fullExamText))
        {
            throw new ArgumentException("Full exam text cannot be null or empty.", nameof(fullExamText));
        }

        if (allPotentialMatches == null)
        {
            throw new ArgumentNullException(nameof(allPotentialMatches), "Potential matches list cannot be null.");
        }

        var assignment = new AssignmentEx();
        try
        {
            assignment.Title = GetExamTitle(fullExamText);
        }
        catch (Exception ex)
        {
            assignment.Errors.Add(new ParseError(ParseErrorType.UnexpectedError, "Failed to extract exam title.", innerException: ex));
            assignment.Title = "未识别试卷标题";
        }

        // The stack holds the path from the root to the node currently being filled.
        var rootAssignmentQuestion = new AssignmentQuestionEx
        {
            Type = AssignmentStructType.Struct,
            Priority = 0,
            Title = "Root Exam Structure"
        };
        var assignmentQuestionStack = new Stack<AssignmentQuestionEx>();
        assignmentQuestionStack.Push(rootAssignmentQuestion);
        assignment.ExamStruct = rootAssignmentQuestion;

        int currentContentStart = 0;
        if (allPotentialMatches.Count > 0)
        {
            int firstStart = allPotentialMatches[0].StartIndex;
            if (firstStart > 0)
            {
                // Everything before the first hit is treated as introductory description.
                AppendToDescription(assignment, fullExamText.Substring(0, firstStart).Trim());
            }

            currentContentStart = firstStart;
        }

        for (int i = 0; i < allPotentialMatches.Count; i++)
        {
            var pm = allPotentialMatches[i];

            try
            {
                if (!IsValidPotentialMatch(pm, i, fullExamText.Length, currentContentStart, assignment.Errors))
                {
                    // Skip the hit but never move the cursor backwards.
                    currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
                    continue;
                }

                // Text between the previous hit and this one belongs to whatever node is current.
                string precedingText = fullExamText.Substring(currentContentStart, pm.StartIndex - currentContentStart).Trim();
                AttachLooseText(assignment, assignmentQuestionStack, precedingText);

                if (pm.PatternConfig.Type == AssignmentStructType.Option)
                {
                    HandleOptionMatch(pm, i, assignmentQuestionStack.Peek(), assignment.Errors);
                }
                else
                {
                    HandleQuestionGroupMatch(pm, i, assignmentQuestionStack, assignment.Errors);
                }

                currentContentStart = pm.EndIndex;
            }
            catch (Exception ex)
            {
                assignment.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                    $"An unexpected error occurred during main loop processing of PotentialMatch at index {i}.",
                    index: i, matchedText: pm.MatchedText, innerException: ex));
                currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
            }
        }

        if (currentContentStart < fullExamText.Length)
        {
            try
            {
                string remainingText = fullExamText.Substring(currentContentStart).Trim();
                AttachLooseText(assignment, assignmentQuestionStack, remainingText);
            }
            catch (Exception ex)
            {
                assignment.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                    "An unexpected error occurred while processing remaining text after all potential matches.",
                    innerException: ex));
            }
        }

        return assignment;
    }

    /// <summary>
    /// Appends non-blank text to the assignment description on a new line.
    /// </summary>
    private static void AppendToDescription(AssignmentEx assignment, string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        // NOTE(review): AssignmentEx.Description starts as the placeholder "Description", so the
        // first appended text ends up after that placeholder — confirm whether that is intended.
        assignment.Description += (string.IsNullOrWhiteSpace(assignment.Description) ? "" : "\n") + text;
    }

    /// <summary>
    /// Routes text found between hits to the current question, or to the assignment
    /// description when the current node carries no question.
    /// </summary>
    private void AttachLooseText(AssignmentEx assignment, Stack<AssignmentQuestionEx> assignmentQuestionStack, string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return;
        }

        AssignmentQuestionEx current = assignmentQuestionStack.Peek();
        if (current.Question != null)
        {
            ProcessQuestionContent(current, text, assignment.Errors);
        }
        else
        {
            AppendToDescription(assignment, text);
        }
    }

    /// <summary>
    /// Checks that a hit lies after the already-consumed text, inside the document,
    /// and carries both its regex match and its pattern configuration.
    /// </summary>
    /// <returns><c>true</c> when the hit can be processed; otherwise an error is recorded.</returns>
    private bool IsValidPotentialMatch(PotentialMatch pm, int index, int fullTextLength, int currentContentStart, List<ParseError> errors)
    {
        bool indicesOk = pm.StartIndex >= currentContentStart
            && pm.EndIndex <= fullTextLength
            && pm.StartIndex <= pm.EndIndex;
        if (!indicesOk)
        {
            errors.Add(new ParseError(ParseErrorType.Validation,
                $"PotentialMatch at index {index} has invalid start/end indices. Start: {pm.StartIndex}, End: {pm.EndIndex}, CurrentContentStart: {currentContentStart}, FullTextLength: {fullTextLength}",
                index: index, matchedText: pm.MatchedText));
            return false;
        }

        if (pm.RegexMatch == null || pm.PatternConfig == null)
        {
            errors.Add(new ParseError(ParseErrorType.Validation,
                $"PotentialMatch at index {index} is missing RegexMatch or PatternConfig.",
                index: index, matchedText: pm.MatchedText));
            return false;
        }

        return true;
    }

    /// <summary>
    /// Creates a new structure or question node from a heading hit and pushes it on the stack.
    /// </summary>
    /// <param name="pm">The heading hit; its <c>MatchedText</c> is shortened when a trailing score is found.</param>
    /// <param name="index">Position of the hit in the match list, used for error reporting.</param>
    /// <param name="assignmentQuestionStack">Path from the root to the current node.</param>
    /// <param name="errors">Receives any problems found.</param>
    private void HandleQuestionGroupMatch(PotentialMatch pm, int index, Stack<AssignmentQuestionEx> assignmentQuestionStack, List<ParseError> errors)
    {
        try
        {
            // Validate the capture groups BEFORE touching the stack, so that a rejected
            // hit cannot change the nesting of whatever follows it.
            // Group 1 is the numbering, Group 2 is the heading text.
            GroupCollection groups = pm.RegexMatch.Groups;
            if (groups.Count < 3 || !groups[1].Success || string.IsNullOrWhiteSpace(groups[2].Value))
            {
                errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                    $"Question/Group match at index {index} does not have enough regex groups (expected 3 for number and title) or a valid title group (Group 2). Skipping this group.",
                    index: index, matchedText: pm.MatchedText));
                return;
            }

            // Climb back up until the top of the stack has a strictly lower priority
            // number than this hit; the root always stays on the stack.
            while (assignmentQuestionStack.Count > 1 && pm.PatternConfig.Priority <= assignmentQuestionStack.Peek().Priority)
            {
                assignmentQuestionStack.Pop();
            }

            string sequence = assignmentQuestionStack.Count > 0 ? assignmentQuestionStack.Peek().Sequence : string.Empty;

            float score = 0;
            Match scoreMatch = _config.ScoreRegex.Match(pm.MatchedText);
            if (scoreMatch.Success && scoreMatch.Groups.Count > 1 && scoreMatch.Groups[1].Success)
            {
                // The regex only admits '.' as decimal separator, so parse culture-invariantly;
                // the current culture could otherwise misread "0.5".
                bool parsed = float.TryParse(
                    scoreMatch.Groups[1].Value,
                    System.Globalization.NumberStyles.Float,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out score);
                if (!parsed)
                {
                    errors.Add(new ParseError(ParseErrorType.DataParsing,
                        $"Failed to parse score '{scoreMatch.Groups[1].Value}' for match at index {index}. Defaulting to 0.",
                        index: index, matchedText: pm.MatchedText));
                }

                // Drop the score from MatchedText so it only holds numbering and heading.
                // This changes the hit object only, never the original exam text.
                pm.MatchedText = pm.MatchedText.Substring(0, scoreMatch.Index).Trim();
            }

            // Group 2 runs to the end of the line and therefore still contains the
            // trailing "(N分)"; strip it so the title holds the heading text only.
            string rawTitle = groups[2].Value;
            Match titleScore = _config.ScoreRegex.Match(rawTitle);
            string title = (titleScore.Success ? rawTitle.Substring(0, titleScore.Index) : rawTitle).Trim();

            // NOTE(review): the separator is " ." (space, then dot); it may have been meant
            // to be "." — confirm before changing, since it alters stored sequences.
            string seq = groups[1].Value.Trim();
            seq = string.IsNullOrEmpty(seq) || string.IsNullOrEmpty(sequence) ? seq : " ." + seq;

            var newAssignmentQuestion = new AssignmentQuestionEx
            {
                Priority = pm.PatternConfig.Priority,
                Type = pm.PatternConfig.Type,
                Sequence = sequence + seq,
                Score = score
            };

            if (pm.PatternConfig.Type == AssignmentStructType.Struct)
            {
                // Structure nodes keep the heading on the node itself.
                newAssignmentQuestion.Title = title;
            }
            else
            {
                // Every other heading type carries a question holding the heading text.
                newAssignmentQuestion.Question = new QuestionEx { Title = title };
            }

            assignmentQuestionStack.Peek().ChildrenAssignmentQuestion.Add(newAssignmentQuestion);
            assignmentQuestionStack.Push(newAssignmentQuestion);
        }
        catch (Exception ex)
        {
            errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                $"An unexpected error occurred during processing a non-option match (type: {pm.PatternConfig.Type}) at index {index}.",
                index: index, matchedText: pm.MatchedText, innerException: ex));
        }
    }

    /// <summary>
    /// Adds an option hit to the current question, or records why it was ignored.
    /// </summary>
    /// <param name="pm">The option hit.</param>
    /// <param name="index">Position of the hit in the match list, used for error reporting.</param>
    /// <param name="currentAssignmentQuestion">Node currently on top of the stack.</param>
    /// <param name="errors">Receives any problems found.</param>
    private void HandleOptionMatch(PotentialMatch pm, int index, AssignmentQuestionEx currentAssignmentQuestion, List<ParseError> errors)
    {
        try
        {
            if (currentAssignmentQuestion.Question == null)
            {
                errors.Add(new ParseError(ParseErrorType.Structural,
                    $"Found isolated Option at index {index}. Options must belong to a 'Question' type structure. Ignoring this option.",
                    index: index, matchedText: pm.MatchedText));
                return;
            }

            // Group 1 is the label, Group 2 is the option text.
            GroupCollection groups = pm.RegexMatch.Groups;
            if (groups.Count < 3 || !groups[1].Success || string.IsNullOrWhiteSpace(groups[2].Value))
            {
                errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                    $"Option match at index {index} does not have enough regex groups or valid label/text groups (Group 1/2). Skipping this option.",
                    index: index, matchedText: pm.MatchedText));
                return;
            }

            currentAssignmentQuestion.Question.Options.Add(new Option
            {
                Label = groups[1].Value.Trim(),
                Text = groups[2].Value.Trim()
            });
        }
        catch (Exception ex)
        {
            errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                $"An unexpected error occurred during processing Option at index {index}.",
                index: index, matchedText: pm.MatchedText, innerException: ex));
        }
    }

    /// <summary>
    /// Appends free text to the title of a question node, on a new line when the title is not blank.
    /// Records a structural error when the node carries no question.
    /// </summary>
    private void ProcessQuestionContent(AssignmentQuestionEx question, string contentText, List<ParseError> errors)
    {
        if (question?.Question == null)
        {
            errors.Add(new ParseError(ParseErrorType.Structural,
                $"Attempted to process content for a non-question type AssignmentQuestionEx (Type: {question?.Type}). Content: '{contentText}'",
                matchedText: contentText));
            return;
        }

        if (!string.IsNullOrWhiteSpace(contentText))
        {
            question.Question.Title += (string.IsNullOrWhiteSpace(question.Question.Title) ? "" : "\n") + contentText;
        }
    }

    /// <summary>
    /// Returns the first non-blank line of the exam text, or a fallback title when there is none.
    /// </summary>
    private string GetExamTitle(string examPaperText)
    {
        var firstLine = examPaperText.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault(line => !string.IsNullOrWhiteSpace(line));
        return firstLine ?? "未识别试卷标题";
    }
}
|
||
|
||
/// <summary>
/// Entry point for exam parsing: scans the text for hits and builds the structure from them.
/// </summary>
public class ExamParser
{
    private readonly ExamParserConfig _config;
    private readonly ExamDocumentScanner _scanner;
    private readonly ExamStructureBuilder _builder;

    /// <summary>
    /// Creates a parser whose scanner and builder share the given configuration.
    /// </summary>
    /// <param name="config">Patterns and score regex to parse with.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is <c>null</c>.</exception>
    public ExamParser(ExamParserConfig config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
        _scanner = new ExamDocumentScanner(_config);
        _builder = new ExamStructureBuilder(_config);
    }

    /// <summary>
    /// Parses the given exam text into a structured <see cref="AssignmentEx"/>.
    /// </summary>
    /// <param name="examPaperText">The complete exam text.</param>
    /// <returns>The parsed assignment, including errors from both scanning and building.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="examPaperText"/> is null, empty or whitespace (thrown by the builder).
    /// </exception>
    public AssignmentEx ParseExamPaper(string examPaperText)
    {
        // Scanner errors are collected separately: the builder returns its own AssignmentEx,
        // so errors written to any earlier instance would be lost.
        var scanErrors = new List<ParseError>();
        List<PotentialMatch> allPotentialMatches = _scanner.Scan(examPaperText, scanErrors);

        AssignmentEx assignment = _builder.BuildExam(examPaperText, allPotentialMatches);

        if (scanErrors.Count > 0)
        {
            // Scanning happened first, so its errors go in front of the builder's.
            assignment.Errors.InsertRange(0, scanErrors);
        }

        return assignment;
    }
}
|
||
} |