update-examParseByText

This commit is contained in:
SpecialX
2025-05-30 12:48:43 +08:00
parent e824c081bf
commit 97843ab5fd
4 changed files with 713 additions and 429 deletions

View File

@@ -1,429 +0,0 @@
using System.Text.RegularExpressions;
namespace TechHelper.Client.Exam.Parse
{
/// <summary>
/// Root object of a parsed exam: header information plus the question tree.
/// </summary>
public class ExamPaper
{
    /// <summary>Exam title. Defaults to a placeholder meaning "unrecognized exam title".</summary>
    public string Title { get; set; } = "未识别试卷标题";

    /// <summary>Exam description. Defaults to a placeholder meaning "unrecognized exam description".</summary>
    public string Descript { get; set; } = "未识别试卷描述";

    /// <summary>Subject area of the exam. Defaults to a placeholder meaning "exam category".</summary>
    public string SubjectArea { get; set; } = "试卷类别";

    /// <summary>Top-level major question groups (sections) of the exam.</summary>
    public List<MajorQuestionGroup> MajorQuestionGroups { get; set; } = new();

    /// <summary>Questions that are not attached to any major question group.</summary>
    public List<Question> TopLevelQuestions { get; set; } = new();
}
/// <summary>
/// A section of the exam (for example "一、选择题") that can contain nested
/// sections as well as questions.
/// </summary>
public class MajorQuestionGroup
{
    /// <summary>Section heading text.</summary>
    public string Title { get; set; } = "";

    /// <summary>Free text attached to the section (e.g. instructions).</summary>
    public string Descript { get; set; } = "";

    /// <summary>Score assigned to the section; 0 when none was set.</summary>
    public float Score { get; set; }

    /// <summary>Sections nested directly under this one.</summary>
    public List<MajorQuestionGroup> SubMajorQuestionGroups { get; set; } = new();

    /// <summary>Questions that belong directly to this section.</summary>
    public List<Question> Questions { get; set; } = new();

    /// <summary>Nesting priority of this section.</summary>
    public int Priority { get; set; }
}
/// <summary>
/// A single question, optionally carrying answer options and nested sub-questions.
/// </summary>
public class Question
{
    /// <summary>Question number as it appears in the text (kept as a string).</summary>
    public string Number { get; set; } = "";

    /// <summary>Question stem.</summary>
    public string Text { get; set; } = "";

    /// <summary>Score assigned to the question; 0 when none was set.</summary>
    public float Score { get; set; }

    /// <summary>Answer options of the question.</summary>
    public List<Option> Options { get; set; } = new();

    /// <summary>Questions nested under this one.</summary>
    public List<Question> SubQuestions { get; set; } = new();

    /// <summary>Nesting priority of this question.</summary>
    public int Priority { get; set; }
}
/// <summary>
/// One answer option of a question.
/// </summary>
public class Option
{
    /// <summary>Option label, e.g. "A".</summary>
    public string Label { get; set; } = "";

    /// <summary>Option content.</summary>
    public string Text { get; set; } = "";
}
/// <summary>
/// A regular expression together with its priority.
/// </summary>
/// <remarks>
/// The compiled <see cref="Regex"/> is always built from the current
/// <see cref="Pattern"/>: assigning a new pattern recompiles it, so the two
/// can never drift apart.
/// </remarks>
public class RegexPatternConfig
{
    // Multiline so that ^ and $ anchor at line boundaries; Compiled for repeated use.
    private const RegexOptions PatternOptions = RegexOptions.Multiline | RegexOptions.Compiled;

    private string _pattern;

    /// <summary>
    /// Regular-expression source string. Setting it rebuilds <see cref="Regex"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
    /// <exception cref="ArgumentException">The assigned value is not a valid pattern.</exception>
    public string Pattern
    {
        get { return _pattern; }
        set
        {
            // Compile first: if the pattern is invalid the old state is kept intact.
            Regex = new Regex(value, PatternOptions);
            _pattern = value;
        }
    }

    /// <summary>Priority; a smaller number means a higher priority.</summary>
    public int Priority { get; set; }

    /// <summary>Compiled regular expression for <see cref="Pattern"/>.</summary>
    public Regex Regex { get; private set; }

    /// <summary>
    /// Creates a configuration and compiles <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular-expression source string.</param>
    /// <param name="priority">Priority; a smaller number means a higher priority.</param>
    public RegexPatternConfig(string pattern, int priority)
    {
        Regex = new Regex(pattern, PatternOptions);
        _pattern = pattern;
        Priority = priority;
    }
}
/// <summary>
/// Configuration for exam parsing; holds every regular expression used to
/// recognise major question groups, questions and options.
/// </summary>
/// <remarks>
/// Capture-group contract expected by the code that consumes these matches:
/// <list type="bullet">
/// <item>Major question group: group 1 = title, group 2 = score (optional).</item>
/// <item>Question: group 1 = number, group 2 = question text.</item>
/// <item>Option: group 1 = label, group 2 = option text.</item>
/// </list>
/// </remarks>
public class ExamParserConfig
{
    /// <summary>Patterns that recognise major question group headings.</summary>
    public List<RegexPatternConfig> MajorQuestionGroupPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise question lines.</summary>
    public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognise option lines.</summary>
    public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>
    /// Registers the default patterns.
    /// </summary>
    public ExamParserConfig()
    {
        // The trailing "\r?" lets the optional "(N分)" score suffix match on CRLF input:
        // in Multiline mode "$" anchors before '\n', so a preceding '\r' would otherwise
        // block the suffix and push it into the title.

        // e.g. "一、选择题 (5分)"
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^[一二三四五六七八九十]+\s*[、.]\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 1));

        // e.g. "1. 填空题 (10分)"
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^\d+\.\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 2));

        // e.g. "(一) 文言文阅读 (8分)". The "(一)" prefix is non-capturing so that
        // group 1 is the title and group 2 the score, like the other group patterns.
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^(?:\(.+\))\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 3));

        // e.g. "1. question text"
        QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.\s*(.*)$", 1));

        // e.g. "A. option text". The label is captured as group 1, the text as group 2.
        OptionPatterns.Add(new RegexPatternConfig(@"^([A-D])\.\s*(.*)$", 1));
    }
}
/// <summary>
/// One regular-expression hit found in the exam text, before it has been
/// placed into the exam structure.
/// </summary>
public class PotentialMatch
{
    /// <summary>Start position of the match in the original text.</summary>
    public int StartIndex { get; set; }

    /// <summary>End position of the matched structure in the original text.</summary>
    public int EndIndex { get; set; }

    /// <summary>The complete matched line or paragraph.</summary>
    public string MatchedText { get; set; }

    /// <summary>The underlying <see cref="Match"/>, kept to give access to capture groups.</summary>
    public Match RegexMatch { get; set; }

    /// <summary>Configuration of the pattern that produced this match.</summary>
    public RegexPatternConfig PatternConfig { get; set; }

    /// <summary>Kind of structure matched (major question group, question, option, ...).</summary>
    public MatchType Type { get; set; }
}
/// <summary>
/// Kind of structure a <c>PotentialMatch</c> represents.
/// </summary>
public enum MatchType
{
    /// <summary>A major question group heading.</summary>
    MajorQuestionGroup = 0,

    /// <summary>A question line.</summary>
    Question = 1,

    /// <summary>An option line.</summary>
    Option = 2,

    /// <summary>Any other type that may need to be recognised.</summary>
    Other = 3
}
/// <summary>
/// Scans raw exam text and collects every potential match (major question
/// groups, questions, options). It only matches; it does not decide which
/// structure a match belongs to.
/// </summary>
public class ExamDocumentScanner
{
    private readonly ExamParserConfig _config;

    /// <summary>Creates a scanner that uses the patterns of <paramref name="config"/>.</summary>
    public ExamDocumentScanner(ExamParserConfig config)
    {
        _config = config;
    }

    /// <summary>
    /// Scans the given text and returns all potential matches ordered by start position.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>Every <see cref="PotentialMatch"/> found, sorted by <c>StartIndex</c>.</returns>
    public List<PotentialMatch> Scan(string text)
    {
        var found = new List<PotentialMatch>();

        // Collection order matters: groups, then questions, then options.
        CollectMatches(found, _config.MajorQuestionGroupPatterns, MatchType.MajorQuestionGroup, text);
        CollectMatches(found, _config.QuestionPatterns, MatchType.Question, text);
        CollectMatches(found, _config.OptionPatterns, MatchType.Option, text);

        // OrderBy is a stable sort, so matches sharing a start index keep the
        // collection order above.
        return found.OrderBy(candidate => candidate.StartIndex).ToList();
    }

    /// <summary>
    /// Runs every pattern in <paramref name="patterns"/> over <paramref name="text"/> and
    /// appends one entry of kind <paramref name="kind"/> per hit to <paramref name="sink"/>.
    /// </summary>
    private static void CollectMatches(List<PotentialMatch> sink, List<RegexPatternConfig> patterns, MatchType kind, string text)
    {
        foreach (var pattern in patterns)
        {
            foreach (Match hit in pattern.Regex.Matches(text))
            {
                var candidate = new PotentialMatch();
                candidate.StartIndex = hit.Index;
                candidate.EndIndex = hit.Index + hit.Length;
                candidate.MatchedText = hit.Value;
                candidate.RegexMatch = hit;
                candidate.PatternConfig = pattern;
                candidate.Type = kind;
                sink.Add(candidate);
            }
        }
    }
}
/// <summary>
/// Turns the raw exam text plus a position-ordered list of matches into a
/// hierarchical <see cref="ExamPaper"/>.
/// </summary>
public class ExamStructureBuilder
{
    // Stored by the constructor; no method of this class reads it at present.
    private readonly ExamParserConfig _config;

    /// <summary>Creates a builder.</summary>
    /// <param name="config">Parser configuration.</param>
    public ExamStructureBuilder(ExamParserConfig config)
    {
        _config = config;
    }

    /// <summary>
    /// Builds the exam structure.
    /// </summary>
    /// <param name="fullExamText">Complete exam text that the matches were taken from.</param>
    /// <param name="allPotentialMatches">
    /// Matches whose indices refer to <paramref name="fullExamText"/>, ordered by
    /// <c>StartIndex</c>. Matches may overlap (several patterns can hit the same line).
    /// </param>
    /// <returns>
    /// The assembled exam. Text between matches is appended to the active question,
    /// otherwise to the active major question group, otherwise it is ignored.
    /// </returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public ExamPaper BuildExamPaper(string fullExamText, List<PotentialMatch> allPotentialMatches)
    {
        if (fullExamText == null)
        {
            throw new ArgumentNullException(nameof(fullExamText));
        }
        if (allPotentialMatches == null)
        {
            throw new ArgumentNullException(nameof(allPotentialMatches));
        }

        var examPaper = new ExamPaper();
        examPaper.Title = GetExamTitle(fullExamText);

        var majorQGStack = new Stack<MajorQuestionGroup>();
        MajorQuestionGroup currentMajorQG = null;
        var questionStack = new Stack<Question>();
        Question currentQuestion = null;

        // Everything before this position has already been consumed.
        int currentContentStart = 0;

        foreach (var pm in allPotentialMatches)
        {
            // Only slice when there is a real gap. When two patterns match the same
            // line the second match starts before the consumed position, and an
            // unguarded Substring would receive a negative length and throw.
            if (pm.StartIndex > currentContentStart)
            {
                string gap = fullExamText.Substring(currentContentStart, pm.StartIndex - currentContentStart);
                AppendFreeText(gap, currentQuestion, currentMajorQG);
            }

            if (pm.Type == MatchType.MajorQuestionGroup)
            {
                // A group whose priority number is less than or equal to the one on top
                // of the stack closes that group (smaller number = higher level).
                while (majorQGStack.Count > 0 && pm.PatternConfig.Priority <= majorQGStack.Peek().Priority)
                {
                    majorQGStack.Pop();
                }

                var newMajorQG = new MajorQuestionGroup
                {
                    Title = pm.RegexMatch.Groups[1].Value.Trim(),
                    Score = ParseScore(pm.RegexMatch.Groups[2]),
                    Priority = pm.PatternConfig.Priority
                };

                if (majorQGStack.Count > 0)
                {
                    majorQGStack.Peek().SubMajorQuestionGroups.Add(newMajorQG);
                }
                else
                {
                    examPaper.MajorQuestionGroups.Add(newMajorQG);
                }

                majorQGStack.Push(newMajorQG);
                currentMajorQG = newMajorQG;

                // A new group ends whatever question was being collected.
                questionStack.Clear();
                currentQuestion = null;
            }
            else if (pm.Type == MatchType.Question)
            {
                // Same closing rule as for groups, applied to the question stack.
                while (questionStack.Count > 0 && pm.PatternConfig.Priority <= questionStack.Peek().Priority)
                {
                    questionStack.Pop();
                }

                var newQuestion = new Question
                {
                    Number = pm.RegexMatch.Groups[1].Value.Trim(),
                    Text = pm.RegexMatch.Groups[2].Value.Trim(),
                    // Group 2 is the question text, so a score can only come from a
                    // third capture group. Patterns without one leave the score at 0.
                    Score = ParseScore(pm.RegexMatch.Groups[3]),
                    Priority = pm.PatternConfig.Priority
                };

                if (questionStack.Count > 0)
                {
                    questionStack.Peek().SubQuestions.Add(newQuestion);
                }
                else if (currentMajorQG != null)
                {
                    currentMajorQG.Questions.Add(newQuestion);
                }
                else
                {
                    // No active group or question: keep it at the top level of the paper.
                    examPaper.TopLevelQuestions.Add(newQuestion);
                }

                questionStack.Push(newQuestion);
                currentQuestion = newQuestion;
            }
            else if (pm.Type == MatchType.Option)
            {
                // An option must belong to a question.
                if (currentQuestion != null)
                {
                    currentQuestion.Options.Add(new Option
                    {
                        Label = pm.RegexMatch.Groups[1].Value.Trim(),
                        Text = pm.RegexMatch.Groups[2].Value.Trim()
                    });
                }
                else
                {
                    Console.WriteLine($"Warning: Found isolated Option at index {pm.StartIndex}: {pm.MatchedText}");
                }
            }

            // Never move backwards: with overlapping matches the shorter one must not
            // re-expose text that the longer one already covered.
            currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
        }

        // Text after the last match.
        if (currentContentStart < fullExamText.Length)
        {
            AppendFreeText(fullExamText.Substring(currentContentStart), currentQuestion, currentMajorQG);
        }

        return examPaper;
    }

    /// <summary>
    /// Extracts the exam title (simple implementation): the first line that is not
    /// blank, or a placeholder when there is none.
    /// </summary>
    private string GetExamTitle(string examPaperText)
    {
        var firstLine = examPaperText
            .Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault(line => !string.IsNullOrWhiteSpace(line));
        return firstLine ?? "未识别试卷标题";
    }

    /// <summary>
    /// Appends unstructured text to the active question if there is one, otherwise to
    /// the active major question group; text with no owner is dropped.
    /// </summary>
    /// <remarks>
    /// The text passed here never contains a recognised option or question: every match
    /// is handled by the main loop, and this method only sees the gaps between matches.
    /// </remarks>
    private static void AppendFreeText(string rawText, Question currentQuestion, MajorQuestionGroup currentMajorQG)
    {
        string text = rawText.Trim();
        if (text.Length == 0)
        {
            return;
        }

        if (currentQuestion != null)
        {
            currentQuestion.Text += (string.IsNullOrWhiteSpace(currentQuestion.Text) ? "" : "\n") + text;
        }
        else if (currentMajorQG != null)
        {
            currentMajorQG.Descript += (string.IsNullOrWhiteSpace(currentMajorQG.Descript) ? "" : "\n") + text;
        }
    }

    /// <summary>
    /// Reads a score from a capture group. Returns 0 when the group did not take part
    /// in the match or does not hold a number; parsing is culture-invariant.
    /// </summary>
    private static float ParseScore(Group scoreGroup)
    {
        if (!scoreGroup.Success)
        {
            return 0f;
        }

        float score;
        bool parsed = float.TryParse(
            scoreGroup.Value,
            System.Globalization.NumberStyles.Float,
            System.Globalization.CultureInfo.InvariantCulture,
            out score);
        return parsed ? score : 0f;
    }
}
}