Files
TechHelper/TechHelper.Client/Exam/ExamParse.cs
2025-06-20 18:58:11 +08:00

761 lines
28 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using Entities.DTO;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace TechHelper.Client.Exam
{
/// <summary>
/// Immutable record of one problem found while parsing an exam paper.
/// </summary>
public class ParseError
{
    /// <summary>Category of the problem.</summary>
    public ParseErrorType Type { get; }

    /// <summary>Human-readable description of the problem.</summary>
    public string Message { get; }

    /// <summary>Optional index associated with the problem; null when none was supplied.</summary>
    public int? Index { get; }

    /// <summary>Optional text of the match that triggered the problem.</summary>
    public string MatchedText { get; }

    /// <summary>Optional exception that caused the problem.</summary>
    public Exception InnerException { get; }

    /// <summary>
    /// Creates a parse error; everything except the type and message is optional.
    /// </summary>
    public ParseError(ParseErrorType type, string message, int? index = null, string matchedText = null, Exception innerException = null)
    {
        this.Type = type;
        this.Message = message;
        this.Index = index;
        this.MatchedText = matchedText;
        this.InnerException = innerException;
    }

    /// <summary>
    /// Formats the error as "[Type] Message", followed by the index, matched text
    /// and inner exception message when they are present.
    /// </summary>
    public override string ToString()
    {
        string rendered = $"[{Type}] {Message}";

        if (Index.HasValue)
        {
            rendered += $" (Index: {Index.Value})";
        }

        if (!string.IsNullOrEmpty(MatchedText))
        {
            rendered += $" (MatchedText: '{MatchedText}')";
        }

        if (InnerException != null)
        {
            rendered += $" InnerException: {InnerException.Message}";
        }

        return rendered;
    }
}
/// <summary>
/// Categories used to classify a <see cref="ParseError"/>.
/// </summary>
public enum ParseErrorType
{
Validation = 1, // input validation failed
DataParsing = 2, // data parsing failed (e.g. a number conversion)
Structural = 3, // structural problem (e.g. an option with no owning question)
RegexMatchIssue = 4, // a regex match result was not what was expected
UnexpectedError = 5 // unanticipated, general-purpose error
}
/// <summary>
/// Root object of a parsed exam: title, description, question groups,
/// questions that belong to no group, and the errors collected while parsing.
/// </summary>
public class ExamPaper
{
/// <summary>Title of the paper; initialised to a placeholder string.</summary>
public string AssignmentTitle { get; set; } = "未识别试卷标题";
/// <summary>Descriptive text of the paper; initialised to a placeholder string.</summary>
public string Description { get; set; } = "未识别试卷描述";
/// <summary>Subject category; initialised to a placeholder string and not assigned elsewhere in this file.</summary>
public string SubjectArea { get; set; } = "试卷类别";
/// <summary>Top-level question groups.</summary>
public List<MajorQuestionGroup> QuestionGroups { get; set; } = new List<MajorQuestionGroup>();
/// <summary>Questions that are not nested under any group or other question.</summary>
public List<PaperQuestion> TopLevelQuestions { get; set; } = new List<PaperQuestion>();
/// <summary>Problems recorded during parsing.</summary>
public List<ParseError> Errors { get; set; } = new List<ParseError>();
}
/// <summary>
/// A titled section of the exam that can contain nested groups and questions.
/// </summary>
public class MajorQuestionGroup
{
/// <summary>Heading text of the group.</summary>
public string Title { get; set; } = string.Empty;
/// <summary>Free text that follows the heading and belongs to no question.</summary>
public string Descript { get; set; } = string.Empty;
/// <summary>Score attached to the group; 0 when none was parsed.</summary>
public float Score { get; set; }
/// <summary>Groups nested directly under this one.</summary>
public List<MajorQuestionGroup> SubQuestionGroups { get; set; } = new List<MajorQuestionGroup>();
/// <summary>Questions attached directly to this group.</summary>
public List<PaperQuestion> SubQuestions { get; set; } = new List<PaperQuestion>();
/// <summary>Priority of the pattern that produced this group; the builder compares it to decide nesting.</summary>
public int Priority { get; set; }
// NOTE(review): defaults to true and the builder in this file also sets it to true;
// its intended meaning is not evident from this file — confirm against consumers.
public bool bGroup { get; set; } = true;
}
/// <summary>
/// A single question, with its options and any nested sub-questions.
/// </summary>
public class PaperQuestion
{
/// <summary>Question number as captured from the text (kept as a string).</summary>
public string Number { get; set; } = string.Empty;
/// <summary>Question text; later loose text may be appended to it on new lines.</summary>
public string Stem { get; set; } = string.Empty;
/// <summary>Score attached to the question; 0 when none was parsed.</summary>
public float Score { get; set; }
/// <summary>Answer options that belong to this question.</summary>
public List<Option> Options { get; set; } = new List<Option>();
/// <summary>Questions nested directly under this one.</summary>
public List<PaperQuestion> SubQuestions { get; set; } = new List<PaperQuestion>();
// NOTE(review): SampleAnswer and QuestionType are never assigned in this file —
// presumably filled in by other code; confirm.
public string SampleAnswer { get; set; } = string.Empty;
public string QuestionType { get; set; } = string.Empty;
/// <summary>Priority of the pattern that produced this question; the builder compares it to decide nesting.</summary>
public int Priority { get; set; }
}
/// <summary>
/// One answer option of a question.
/// </summary>
public class Option
{
/// <summary>Option label as captured by the option pattern (for the built-in patterns: a letter followed by a period).</summary>
public string Label { get; set; } = string.Empty;
/// <summary>Option text.</summary>
public string Text { get; set; } = string.Empty;
}
/// <summary>
/// A regular-expression pattern together with its priority.
/// </summary>
/// <remarks>
/// <see cref="Regex"/> is always built from the current <see cref="Pattern"/>, so the two
/// cannot drift apart when <see cref="Pattern"/> is reassigned after construction.
/// </remarks>
public class RegexPatternConfig
{
    private string _pattern;

    /// <summary>
    /// Source text of the regular expression. Assigning a value rebuilds <see cref="Regex"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
    /// <exception cref="ArgumentException">The assigned value is not a valid regular expression.</exception>
    public string Pattern
    {
        get { return _pattern; }
        set
        {
            // Build first: if the pattern is invalid the previous state stays intact.
            Regex = BuildRegex(value);
            _pattern = value;
        }
    }

    /// <summary>Priority of the pattern; a smaller number means a higher priority.</summary>
    public int Priority { get; set; }

    /// <summary>Regex object built from <see cref="Pattern"/> (multiline, compiled).</summary>
    public Regex Regex { get; private set; }

    /// <summary>
    /// Creates the configuration and builds the regex for <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular-expression source text.</param>
    /// <param name="priority">Priority; a smaller number means a higher priority.</param>
    public RegexPatternConfig(string pattern, int priority)
    {
        Pattern = pattern;
        Priority = priority;
    }

    // Single place that fixes the options every pattern is built with.
    private static Regex BuildRegex(string pattern)
    {
        if (pattern == null)
        {
            throw new ArgumentNullException(nameof(pattern));
        }

        // Multiline so that ^ and $ anchor at line boundaries; Compiled for repeated use.
        return new Regex(pattern, RegexOptions.Multiline | RegexOptions.Compiled);
    }
}
/// <summary>
/// Names the three kinds of pattern list; the member names mirror the list
/// properties of <see cref="ExamParserConfig"/>.
/// </summary>
// NOTE(review): not referenced anywhere else in this file — presumably used by callers; confirm.
public enum ExamParserEnum
{
MajorQuestionGroupPatterns = 0,
QuestionPatterns,
OptionPatterns
}
/// <summary>
/// Configuration for exam parsing: holds every regular expression used to
/// recognise question groups, questions and options.
/// </summary>
/// <remarks>
/// All group and question patterns share one capture layout, which the structure
/// builder relies on: group 1 = number, group 2 = title/stem, group 4 = numeric score
/// (only present when the line ends with "(N分)").
/// Option patterns use group 1 = label and group 2 = option text.
/// </remarks>
public class ExamParserConfig
{
    /// <summary>Patterns that recognise question-group headings.</summary>
    public List<RegexPatternConfig> MajorQuestionGroupPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise questions, from outermost (lowest number) to innermost.</summary>
    public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise answer options.</summary>
    public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>
    /// Fills the three lists with the built-in patterns.
    /// </summary>
    public ExamParserConfig()
    {
        // Group heading: "一、标题 (10分)" or "一.标题".
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^([一二三四五六七八九十]+)[、\.]\s*(.+?)(?:\s*\(((\d+(?:\.\d+)?))\s*分\))?\s*$", 1));

        // Priority 1: "(一) 题目 (5分)" or "(一) 题目".
        QuestionPatterns.Add(new RegexPatternConfig(@"^\(([一二三四五六七八九十]{1,2}|十[一二三四五六七八九])\)\s*(.+?)(?:\s*\(((\d+(?:\.\d+)?))\s*分\))?\s*$", 1));

        // Priority 2: "1. 题目 (5分)" or "1. 题目".
        QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.\s*(.+?)(?:\s*\(((\d+(?:\.\d+)?))\s*分\))?\s*$", 2));

        // Priority 3: "(1) 子题目 (3分)" or "(1) 子题目".
        QuestionPatterns.Add(new RegexPatternConfig(@"^\((\d+)\)\s*(.+?)(?:\s*\(((\d+(?:\.\d+)?))\s*分\))?\s*$", 3));

        // Priority 4: "① 子题目 (2分)" or "① 子题目".
        // The circled number must be captured as group 1 like in the other patterns;
        // without it the stem would land in group 1 and the score in group 2.
        QuestionPatterns.Add(new RegexPatternConfig(@"^([①②③④⑤⑥⑦⑧⑨⑩]+)\s*(.+?)(?:\s*\(((\d+(?:\.\d+)?))\s*分\))?\s*$", 4));

        // Options labelled with an upper-case letter ("A. ...").
        OptionPatterns.Add(new RegexPatternConfig(@"([A-Z]\.)\s*(.*?)(?=[A-Z]\.|$)", 1));

        // Options labelled with a lower-case letter ("a. ...").
        OptionPatterns.Add(new RegexPatternConfig(@"([a-z]\.)\s*(.*?)(?=[a-z]\.|$)", 1));
    }
}
/// <summary>
/// One regex hit found by the scanner, with its position, source pattern and kind.
/// </summary>
public class PotentialMatch
{
public int StartIndex { get; set; } // start position of the match in the scanned text
public int EndIndex { get; set; } // end position of the matched structure in the original text (the scanner stores match.Index + match.Length)
public string MatchedText { get; set; } // the complete matched line or passage
public Match RegexMatch { get; set; } // original Regex Match object, kept for access to capture groups
public RegexPatternConfig PatternConfig { get; set; } // configuration of the pattern that produced the match
public MatchType Type { get; set; } // kind of match: MajorQuestionGroup, Question, Option, etc.
}
/// <summary>
/// Kind of structure a <see cref="PotentialMatch"/> represents.
/// </summary>
public enum MatchType
{
MajorQuestionGroup,
Question,
Option,
Other // reserved for any further kinds that need to be recognised
}
/// <summary>
/// Scans raw text and gathers every potential match (groups, questions, options).
/// It only matches; it does not decide which structure owns which match.
/// </summary>
public class ExamDocumentScanner
{
    private readonly ExamParserConfig _config;

    /// <summary>
    /// Creates a scanner bound to the given pattern configuration.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public ExamDocumentScanner(ExamParserConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        _config = config;
    }

    /// <summary>
    /// Runs every configured pattern over the text and returns all hits ordered by start position.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>All matches found; an empty list when the text is null or empty.</returns>
    public List<PotentialMatch> Scan(string text)
    {
        var hits = new List<PotentialMatch>();
        if (string.IsNullOrEmpty(text))
        {
            return hits;
        }

        // Collection order (groups, questions, options) matters: the ordering below
        // is stable, so hits sharing a start position keep this relative order.
        Collect(hits, _config.MajorQuestionGroupPatterns, MatchType.MajorQuestionGroup, text);
        Collect(hits, _config.QuestionPatterns, MatchType.Question, text);
        Collect(hits, _config.OptionPatterns, MatchType.Option, text);

        return hits.OrderBy(hit => hit.StartIndex).ToList();
    }

    // Appends one PotentialMatch of the given kind for every hit of every pattern.
    private static void Collect(List<PotentialMatch> sink, List<RegexPatternConfig> patterns, MatchType kind, string text)
    {
        foreach (RegexPatternConfig pattern in patterns)
        {
            foreach (Match hit in pattern.Regex.Matches(text))
            {
                var potential = new PotentialMatch();
                potential.StartIndex = hit.Index;
                potential.EndIndex = hit.Index + hit.Length;
                potential.MatchedText = hit.Value;
                potential.RegexMatch = hit;
                potential.PatternConfig = pattern;
                potential.Type = kind;
                sink.Add(potential);
            }
        }
    }
}
public class ExamStructureBuilder
{
private readonly ExamParserConfig _config;
/// <summary>
/// Creates a builder bound to the given configuration.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
public ExamStructureBuilder(ExamParserConfig config)
{
    if (config == null)
    {
        throw new ArgumentNullException(nameof(config), "ExamParserConfig cannot be null.");
    }

    _config = config;
}
/// <summary>
/// Builds the ExamPaper structure from raw text and potential matches.
/// Collects and returns parsing errors encountered during the process.
/// </summary>
/// <remarks>
/// Sample outline of the kind of paper this method structures:
/// <code>
/// 一.基础
/// 1.听写
/// 2.阅读
/// 二.提升
/// 1.阅读
/// (1).选择
/// (2).填空
/// 三.写
/// (一)课文
/// </code>
/// Matches are walked in list order and are expected to be ordered by start position:
/// a match that starts before the end of the previously consumed one is recorded as a
/// validation error and skipped. Nesting is decided by pattern priority: a new group or
/// question closes every open one whose priority number is greater than or equal to its own.
/// Text between matches is attached to the current question, else to the current group,
/// else to the paper description. The description placeholder is replaced only when such
/// text is actually found.
/// </remarks>
/// <param name="fullExamText">The complete text of the exam paper.</param>
/// <param name="allPotentialMatches">A list of all identified potential matches.</param>
/// <returns>An ExamPaper object containing the parsed structure and a list of errors.</returns>
/// <exception cref="ArgumentException">Thrown if fullExamText is null, empty or whitespace.</exception>
/// <exception cref="ArgumentNullException">Thrown if allPotentialMatches is null.</exception>
public ExamPaper BuildExam(string fullExamText, List<PotentialMatch> allPotentialMatches)
{
    // These input problems cannot be recovered from, so they are still thrown.
    if (string.IsNullOrWhiteSpace(fullExamText))
    {
        throw new ArgumentException("Full exam text cannot be null or empty.", nameof(fullExamText));
    }
    if (allPotentialMatches == null)
    {
        throw new ArgumentNullException(nameof(allPotentialMatches), "Potential matches list cannot be null.");
    }

    var examPaper = new ExamPaper();

    try
    {
        examPaper.AssignmentTitle = GetExamTitle(fullExamText);
    }
    catch (Exception ex)
    {
        // A missing title is not fatal: record it and fall back to the placeholder.
        examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError, "Failed to extract exam title.", innerException: ex));
        examPaper.AssignmentTitle = "未识别试卷标题";
    }

    var majorQGStack = new Stack<MajorQuestionGroup>();
    MajorQuestionGroup currentMajorQG = null;
    var questionStack = new Stack<PaperQuestion>();
    PaperQuestion currentQuestion = null;

    // Description fragments are gathered here and only written to the paper when at
    // least one exists, so real text never gets appended to the placeholder default.
    var descriptionParts = new List<string>();

    // Text before the first match is introductory description. With no matches at all
    // the start stays at 0 and the whole text is handled as trailing content below.
    int currentContentStart = 0;
    if (allPotentialMatches.Count > 0 && allPotentialMatches[0] != null)
    {
        // Clamp so that a corrupt first index cannot make Substring throw.
        currentContentStart = Math.Min(Math.Max(allPotentialMatches[0].StartIndex, 0), fullExamText.Length);
        string introText = fullExamText.Substring(0, currentContentStart).Trim();
        if (!string.IsNullOrWhiteSpace(introText))
        {
            descriptionParts.Add(introText);
        }
    }

    for (int i = 0; i < allPotentialMatches.Count; i++)
    {
        var pm = allPotentialMatches[i];
        if (pm == null)
        {
            // Checked outside the try block because the catch handler reads pm.
            examPaper.Errors.Add(new ParseError(ParseErrorType.Validation,
                $"PotentialMatch at index {i} is null.",
                index: i));
            continue;
        }

        try
        {
            // Data validation: record and skip instead of throwing.
            if (pm.StartIndex < currentContentStart || pm.EndIndex > fullExamText.Length || pm.StartIndex > pm.EndIndex)
            {
                examPaper.Errors.Add(new ParseError(ParseErrorType.Validation,
                    $"PotentialMatch at index {i} has invalid start/end indices. Start: {pm.StartIndex}, End: {pm.EndIndex}, CurrentContentStart: {currentContentStart}, FullTextLength: {fullExamText.Length}",
                    index: i, matchedText: pm.MatchedText));
                currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
                continue;
            }
            if (pm.RegexMatch == null || pm.PatternConfig == null)
            {
                examPaper.Errors.Add(new ParseError(ParseErrorType.Validation,
                    $"PotentialMatch at index {i} is missing RegexMatch or PatternConfig.",
                    index: i, matchedText: pm.MatchedText));
                currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
                continue;
            }

            // Loose text between the previous match and this one.
            string precedingText = fullExamText.Substring(currentContentStart, pm.StartIndex - currentContentStart).Trim();
            if (!string.IsNullOrWhiteSpace(precedingText))
            {
                AttachLooseText(precedingText, currentContentStart, pm.StartIndex,
                    currentQuestion, currentMajorQG, descriptionParts, allPotentialMatches, examPaper.Errors);
            }

            switch (pm.Type)
            {
                case MatchType.MajorQuestionGroup:
                    try
                    {
                        // Close every open group at the same or a deeper level.
                        while (majorQGStack.Any() && pm.PatternConfig.Priority <= majorQGStack.Peek().Priority)
                        {
                            majorQGStack.Pop();
                        }

                        if (pm.RegexMatch.Groups.Count < 2 || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[1].Value))
                        {
                            examPaper.Errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                                $"MajorQuestionGroup match at index {i} does not have enough regex groups or a valid title group (Group 1). Skipping this group.",
                                index: i, matchedText: pm.MatchedText));
                        }
                        else
                        {
                            float groupScore = ParseScore(pm, "MajorQuestionGroup", i, examPaper.Errors);
                            var newMajorQG = new MajorQuestionGroup
                            {
                                Title = pm.RegexMatch.Groups[2].Value.Trim(), // the title is capture group 2
                                Score = groupScore,
                                Priority = pm.PatternConfig.Priority,
                                bGroup = true
                            };

                            if (majorQGStack.Any())
                            {
                                majorQGStack.Peek().SubQuestionGroups.Add(newMajorQG);
                            }
                            else
                            {
                                examPaper.QuestionGroups.Add(newMajorQG);
                            }

                            majorQGStack.Push(newMajorQG);
                            currentMajorQG = newMajorQG;

                            // A new group ends every question that was still open.
                            questionStack.Clear();
                            currentQuestion = null;
                        }
                    }
                    catch (Exception ex)
                    {
                        examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                            $"An unexpected error occurred during processing MajorQuestionGroup at index {i}.",
                            index: i, matchedText: pm.MatchedText, innerException: ex));
                    }
                    break;

                case MatchType.Question:
                    try
                    {
                        // Close every open question at the same or a deeper level.
                        while (questionStack.Any() && pm.PatternConfig.Priority <= questionStack.Peek().Priority)
                        {
                            questionStack.Pop();
                        }

                        if (pm.RegexMatch.Groups.Count < 3 || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[1].Value) || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[2].Value))
                        {
                            examPaper.Errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                                $"Question match at index {i} does not have enough regex groups or valid number/text groups (Group 1/2). Skipping this question.",
                                index: i, matchedText: pm.MatchedText));
                        }
                        else
                        {
                            float questionScore = ParseScore(pm, "Question", i, examPaper.Errors);
                            var newQuestion = new PaperQuestion
                            {
                                Number = pm.RegexMatch.Groups[1].Value.Trim(),
                                Stem = pm.RegexMatch.Groups[2].Value.Trim(),
                                Priority = pm.PatternConfig.Priority,
                                Score = questionScore
                            };

                            // Owner: innermost open question, else current group, else the paper.
                            if (questionStack.Any())
                            {
                                questionStack.Peek().SubQuestions.Add(newQuestion);
                            }
                            else if (currentMajorQG != null)
                            {
                                currentMajorQG.SubQuestions.Add(newQuestion);
                            }
                            else
                            {
                                examPaper.TopLevelQuestions.Add(newQuestion);
                            }

                            questionStack.Push(newQuestion);
                            currentQuestion = newQuestion;
                        }
                    }
                    catch (Exception ex)
                    {
                        examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                            $"An unexpected error occurred during processing Question at index {i}.",
                            index: i, matchedText: pm.MatchedText, innerException: ex));
                    }
                    break;

                case MatchType.Option:
                    try
                    {
                        if (currentQuestion == null)
                        {
                            // Structural problem: an option with no owning question is reported and ignored.
                            examPaper.Errors.Add(new ParseError(ParseErrorType.Structural,
                                $"Found isolated Option at index {i}. Options must belong to a question. Ignoring this option.",
                                index: i, matchedText: pm.MatchedText));
                        }
                        else if (pm.RegexMatch.Groups.Count < 3 || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[1].Value) || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[2].Value))
                        {
                            examPaper.Errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                                $"Option match at index {i} does not have enough regex groups or valid label/text groups (Group 1/2). Skipping this option.",
                                index: i, matchedText: pm.MatchedText));
                        }
                        else
                        {
                            currentQuestion.Options.Add(new Option
                            {
                                Label = pm.RegexMatch.Groups[1].Value.Trim(),
                                Text = pm.RegexMatch.Groups[2].Value.Trim()
                            });
                        }
                    }
                    catch (Exception ex)
                    {
                        examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                            $"An unexpected error occurred during processing Option at index {i}.",
                            index: i, matchedText: pm.MatchedText, innerException: ex));
                    }
                    break;
            }

            // Whatever happened to this match, its text has been consumed.
            currentContentStart = pm.EndIndex;
        }
        catch (Exception ex)
        {
            // Anything not caught by the more specific handlers above.
            examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                $"An unexpected error occurred during main loop processing of PotentialMatch at index {i}.",
                index: i, matchedText: pm.MatchedText, innerException: ex));
            currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
        }
    }

    // Content left after the last match (or the whole text when nothing matched).
    if (currentContentStart < fullExamText.Length)
    {
        try
        {
            string remainingText = fullExamText.Substring(currentContentStart).Trim();
            if (!string.IsNullOrWhiteSpace(remainingText))
            {
                AttachLooseText(remainingText, currentContentStart, fullExamText.Length,
                    currentQuestion, currentMajorQG, descriptionParts, allPotentialMatches, examPaper.Errors);
            }
        }
        catch (Exception ex)
        {
            examPaper.Errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                "An unexpected error occurred while processing remaining text after all potential matches.",
                innerException: ex));
        }
    }

    if (descriptionParts.Count > 0)
    {
        examPaper.Description = string.Join("\n", descriptionParts);
    }

    return examPaper;
}

// Reads the numeric score from capture group 4 of the match, or returns 0 when the
// group is absent or unparsable (the latter is also recorded as a DataParsing error).
private static float ParseScore(PotentialMatch pm, string ownerLabel, int matchIndex, List<ParseError> errors)
{
    float score = 0;
    GroupCollection groups = pm.RegexMatch.Groups;
    if (groups.Count > 4 && groups[4].Success)
    {
        string rawScore = groups[4].Value;

        // The patterns only admit digits with an optional '.', so parse culture-independently;
        // the current culture could read "2.5" as 25 or reject it.
        if (!float.TryParse(rawScore, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out score))
        {
            errors.Add(new ParseError(ParseErrorType.DataParsing,
                $"Failed to parse score '{rawScore}' for {ownerLabel} at index {matchIndex}. Defaulting to 0.",
                index: matchIndex, matchedText: pm.MatchedText));
        }
    }
    return score;
}

// Routes text found between matches: to the open question if there is one, otherwise to
// the open group's description, otherwise to the paper-level description fragments.
private void AttachLooseText(string text, int rangeStart, int rangeEnd, PaperQuestion question, MajorQuestionGroup group,
    List<string> descriptionParts, List<PotentialMatch> allMatches, List<ParseError> errors)
{
    if (question != null)
    {
        ProcessQuestionContent(question, text,
            GetSubMatchesForRange(allMatches, rangeStart, rangeEnd, errors),
            errors);
    }
    else if (group != null)
    {
        group.Descript += (string.IsNullOrWhiteSpace(group.Descript) ? "" : "\n") + text;
    }
    else
    {
        descriptionParts.Add(text);
    }
}
/// <summary>
/// Picks the exam title: the first line of the text that is not blank,
/// returned exactly as it appears (not trimmed).
/// </summary>
/// <param name="examPaperText">Full text of the exam paper.</param>
/// <returns>The first non-blank line, or the placeholder title when every line is blank.</returns>
private string GetExamTitle(string examPaperText)
{
    // No error handling here on purpose: the caller wraps this call in try/catch.
    char[] lineBreaks = { '\r', '\n' };
    string[] lines = examPaperText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    foreach (string candidate in lines)
    {
        if (!string.IsNullOrWhiteSpace(candidate))
        {
            return candidate;
        }
    }

    return "未识别试卷标题";
}
/// <summary>
/// Selects the matches whose start index lies in the half-open range [start, end).
/// Problems are recorded in <paramref name="errors"/> instead of being thrown.
/// </summary>
/// <param name="allMatches">Matches to filter; null yields an empty result.</param>
/// <param name="start">Inclusive lower bound for StartIndex.</param>
/// <param name="end">Exclusive upper bound for StartIndex.</param>
/// <param name="errors">List that receives any validation or unexpected error.</param>
/// <returns>The selected matches, or an empty list on invalid input or failure.</returns>
private List<PotentialMatch> GetSubMatchesForRange(List<PotentialMatch> allMatches, int start, int end, List<ParseError> errors)
{
    var selected = new List<PotentialMatch>();

    // An invalid range is reported and produces an empty result.
    bool rangeIsValid = start >= 0 && end >= start;
    if (!rangeIsValid)
    {
        errors.Add(new ParseError(ParseErrorType.Validation,
            $"Invalid range provided to GetSubMatchesForRange. Start: {start}, End: {end}.",
            index: start)); // start serves as an approximate index
        return selected;
    }

    // BuildExam already rejects a null list; checked again so this method stands on its own.
    if (allMatches == null)
    {
        return selected;
    }

    try
    {
        foreach (PotentialMatch candidate in allMatches)
        {
            if (candidate.StartIndex >= start && candidate.StartIndex < end)
            {
                selected.Add(candidate);
            }
        }
        return selected;
    }
    catch (Exception ex)
    {
        errors.Add(new ParseError(ParseErrorType.UnexpectedError,
            $"An unexpected error occurred getting sub-matches for range [{start}, {end}).",
            innerException: ex));
        return new List<PotentialMatch>(); // discard any partial result on failure
    }
}
/// <summary>
/// Processes text that belongs to a question: option matches in scope become options of
/// the question, and all text outside those options is appended to the stem on new lines.
/// Problems are recorded in <paramref name="errors"/> instead of being thrown.
/// </summary>
/// <param name="question">Question that receives the options and stem text.</param>
/// <param name="contentText">Text belonging to the question.</param>
/// <param name="potentialMatchesInScope">Matches that fall inside the content; only options are used.</param>
/// <param name="errors">List that receives any validation, regex or unexpected error.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="question"/>, <paramref name="contentText"/> or
/// <paramref name="potentialMatchesInScope"/> is null (a caller bug, so it is thrown).
/// </exception>
private void ProcessQuestionContent(PaperQuestion question, string contentText, List<PotentialMatch> potentialMatchesInScope, List<ParseError> errors)
{
    // Contract violations by the caller are thrown rather than recorded.
    if (question == null) throw new ArgumentNullException(nameof(question), "Question cannot be null in ProcessQuestionContent.");
    if (contentText == null) throw new ArgumentNullException(nameof(contentText), "Content text cannot be null in ProcessQuestionContent.");
    if (potentialMatchesInScope == null) throw new ArgumentNullException(nameof(potentialMatchesInScope), "Potential matches in scope cannot be null.");

    try
    {
        // NOTE(review): StartIndex/EndIndex are used below as offsets into contentText,
        // while BuildExam passes matches indexed against the full exam text together with
        // a trimmed slice of it. Such matches will normally fail the bounds check below —
        // confirm which index basis is intended.
        int lastOptionEndIndex = 0;
        foreach (var pm in potentialMatchesInScope.OrderBy(p => p.StartIndex))
        {
            try
            {
                if (pm.Type != MatchType.Option)
                {
                    // Non-option matches carry nothing to extract here. Their text is part of
                    // the stretches appended to the stem below; appending the whole content
                    // once per such match would duplicate it.
                    continue;
                }

                // Index validation: record and skip. An end before the start is rejected too,
                // otherwise it would move the cursor backwards.
                if (pm.StartIndex < lastOptionEndIndex || pm.StartIndex > contentText.Length || pm.EndIndex > contentText.Length || pm.EndIndex < pm.StartIndex)
                {
                    errors.Add(new ParseError(ParseErrorType.Validation,
                        $"Option match at index {pm.StartIndex} has invalid indices within content text. MatchedText: '{pm.MatchedText}'. Skipping.",
                        index: pm.StartIndex, matchedText: pm.MatchedText));
                    continue;
                }

                // Text between the previous option and this one belongs to the stem.
                if (pm.StartIndex > lastOptionEndIndex)
                {
                    string textBeforeOption = contentText.Substring(lastOptionEndIndex, pm.StartIndex - lastOptionEndIndex).Trim();
                    if (!string.IsNullOrWhiteSpace(textBeforeOption))
                    {
                        question.Stem += (string.IsNullOrWhiteSpace(question.Stem) ? "" : "\n") + textBeforeOption;
                    }
                }

                // Capture-group validation: record and skip, but still advance past the match.
                if (pm.RegexMatch.Groups.Count < 3 || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[1].Value) || string.IsNullOrWhiteSpace(pm.RegexMatch.Groups[2].Value))
                {
                    errors.Add(new ParseError(ParseErrorType.RegexMatchIssue,
                        $"Option regex match '{pm.MatchedText}' does not have enough groups (expected 3) for label and text. Skipping option.",
                        index: pm.StartIndex, matchedText: pm.MatchedText));
                    lastOptionEndIndex = pm.EndIndex;
                    continue;
                }

                question.Options.Add(new Option
                {
                    Label = pm.RegexMatch.Groups[1].Value.Trim(),
                    Text = pm.RegexMatch.Groups[2].Value.Trim()
                });
                lastOptionEndIndex = pm.EndIndex;
            }
            catch (Exception innerEx)
            {
                errors.Add(new ParseError(ParseErrorType.UnexpectedError,
                    $"An unexpected error occurred during processing a potential match ({pm.Type}) within question content.",
                    index: pm.StartIndex, matchedText: pm.MatchedText, innerException: innerEx));
                lastOptionEndIndex = pm.EndIndex; // move past the failing match
            }
        }

        // Whatever follows the last option (or everything, when there were none) goes to the stem.
        if (lastOptionEndIndex < contentText.Length)
        {
            string remainingContent = contentText.Substring(lastOptionEndIndex).Trim();
            if (!string.IsNullOrWhiteSpace(remainingContent))
            {
                question.Stem += (string.IsNullOrWhiteSpace(question.Stem) ? "" : "\n") + remainingContent;
            }
        }
    }
    catch (Exception ex)
    {
        // Anything unexpected anywhere in this method.
        errors.Add(new ParseError(ParseErrorType.UnexpectedError,
            $"An unexpected error occurred while processing content for Question '{question.Number}'.",
            innerException: ex));
    }
}
}
/// <summary>
/// Entry point of the parser: scans the text for potential matches and then
/// builds the structured exam from them.
/// </summary>
public class ExamParser
{
    private readonly ExamParserConfig _config;
    private readonly ExamDocumentScanner _scanner;
    private readonly ExamStructureBuilder _builder;

    /// <summary>
    /// Creates a parser whose scanner and builder share the given configuration.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public ExamParser(ExamParserConfig config)
    {
        if (config == null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        _config = config;
        _scanner = new ExamDocumentScanner(config);
        _builder = new ExamStructureBuilder(config);
    }

    /// <summary>
    /// Parses the given exam text into a structured ExamPaper.
    /// </summary>
    /// <param name="examPaperText">Complete text of the exam paper.</param>
    /// <returns>
    /// The parsed ExamPaper; problems found while parsing are listed in its Errors
    /// collection rather than thrown.
    /// </returns>
    public ExamPaper ParseExamPaper(string examPaperText)
    {
        // Step 1 (scan): a single pass over the text collects every potential match.
        // Step 2 (build): the matches are walked in order to form the hierarchy.
        return _builder.BuildExam(examPaperText, _scanner.Scan(examPaperText));
    }
}
}