Files
TechHelper/TechHelper.Client/Exam/ExamParse.cs
SpecialX e824c081bf change
2025-05-30 12:46:55 +08:00

429 lines
14 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System.Text.RegularExpressions;
namespace TechHelper.Client.Exam.Parse
{
/// <summary>
/// Root of a parsed exam: title, description, subject area, the top-level
/// question groups, and any questions that belong to no group.
/// </summary>
public class ExamPaper
{
    /// <summary>Exam title; defaults to a placeholder until one is assigned.</summary>
    public string Title { get; set; } = "未识别试卷标题";

    /// <summary>Exam description; defaults to a placeholder.</summary>
    public string Descript { get; set; } = "未识别试卷描述";

    /// <summary>Subject category of the exam; defaults to a placeholder.</summary>
    public string SubjectArea { get; set; } = "试卷类别";

    /// <summary>Top-level question groups of the paper.</summary>
    public List<MajorQuestionGroup> MajorQuestionGroups { get; set; } = new();

    /// <summary>Questions that are not attached to any question group.</summary>
    public List<Question> TopLevelQuestions { get; set; } = new();
}
/// <summary>
/// A titled section of the exam. It can hold nested sub-groups as well as
/// questions of its own.
/// </summary>
public class MajorQuestionGroup
{
    /// <summary>Heading text of the group.</summary>
    public string Title { get; set; } = "";

    /// <summary>Free text that follows the heading.</summary>
    public string Descript { get; set; } = "";

    /// <summary>Score assigned to the group (0 when unset).</summary>
    public float Score { get; set; }

    /// <summary>Groups nested directly under this one.</summary>
    public List<MajorQuestionGroup> SubMajorQuestionGroups { get; set; } = new();

    /// <summary>Questions that belong directly to this group.</summary>
    public List<Question> Questions { get; set; } = new();

    /// <summary>Priority of the pattern that produced this group; used to decide nesting.</summary>
    public int Priority { get; set; }
}
/// <summary>
/// A single question with its number, stem text, options and sub-questions.
/// </summary>
public class Question
{
    /// <summary>Question number as it appeared in the text.</summary>
    public string Number { get; set; } = "";

    /// <summary>Question stem.</summary>
    public string Text { get; set; } = "";

    /// <summary>Score assigned to the question (0 when unset).</summary>
    public float Score { get; set; }

    /// <summary>Answer options attached to this question.</summary>
    public List<Option> Options { get; set; } = new();

    /// <summary>Questions nested under this one.</summary>
    public List<Question> SubQuestions { get; set; } = new();

    /// <summary>Priority of the pattern that produced this question; used to decide nesting.</summary>
    public int Priority { get; set; }
}
/// <summary>
/// One answer option of a question: a label and the option text.
/// </summary>
public class Option
{
    /// <summary>Option label.</summary>
    public string Label { get; set; } = "";

    /// <summary>Option body text.</summary>
    public string Text { get; set; } = "";
}
/// <summary>
/// A regular expression paired with a priority.
/// </summary>
/// <remarks>
/// The compiled <see cref="Regex"/> is always built from the current
/// <see cref="Pattern"/>: assigning a new pattern recompiles it, so the two
/// can never drift apart.
/// </remarks>
public class RegexPatternConfig
{
    // Multiline so ^ and $ anchor at line boundaries; Compiled to speed up repeated matching.
    private const RegexOptions PatternOptions = RegexOptions.Multiline | RegexOptions.Compiled;

    private string _pattern;

    /// <summary>
    /// The regular expression source. Setting it rebuilds <see cref="Regex"/>.
    /// </summary>
    /// <exception cref="System.ArgumentNullException">The assigned value is null.</exception>
    /// <exception cref="System.ArgumentException">The assigned value is not a valid regular expression.</exception>
    public string Pattern
    {
        get => _pattern;
        set
        {
            // Compile first so a bad pattern leaves the previous state untouched.
            Regex = new Regex(value, PatternOptions);
            _pattern = value;
        }
    }

    /// <summary>Priority; a smaller number means a higher priority.</summary>
    public int Priority { get; set; }

    /// <summary>The compiled regular expression for <see cref="Pattern"/>.</summary>
    public Regex Regex { get; private set; }

    /// <summary>
    /// Creates a configuration and compiles <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression source.</param>
    /// <param name="priority">Priority; a smaller number means a higher priority.</param>
    public RegexPatternConfig(string pattern, int priority)
    {
        Regex = new Regex(pattern, PatternOptions);
        _pattern = pattern;
        Priority = priority;
    }
}
/// <summary>
/// Configuration for exam parsing: the prioritized regular expressions used
/// to recognize question groups, questions and options.
/// </summary>
/// <remarks>
/// Capture-group contract expected by the code that consumes the matches
/// (it reads <c>Groups[1]</c> and <c>Groups[2]</c>):
/// <list type="bullet">
/// <item>Question group patterns: group 1 = title, group 2 = optional score.</item>
/// <item>Question patterns: group 1 = number, group 2 = stem text.</item>
/// <item>Option patterns: group 1 = label, group 2 = option text.</item>
/// </list>
/// NOTE(review): the second question-group pattern and the question pattern
/// both match lines such as "1. text", so such a line is reported as both a
/// group and a question. Confirm whether that overlap is intended.
/// </remarks>
public class ExamParserConfig
{
    /// <summary>Patterns that recognize question-group headings.</summary>
    public List<RegexPatternConfig> MajorQuestionGroupPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognize numbered questions.</summary>
    public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>Patterns that recognize answer options.</summary>
    public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

    /// <summary>
    /// Registers the default patterns.
    /// </summary>
    public ExamParserConfig()
    {
        // e.g. "一、选择题 (5分)"
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^[一二三四五六七八九十]+\s*[、.]\s*(.+?)(?:\s*\((\d+)\s*分\))?$", 1));

        // e.g. "1. 填空题 (10分)"
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^\d+\.\s*(.+?)(?:\s*\((\d+)\s*分\))?$", 2));

        // e.g. "(一) 文言文阅读 (8分)". The parenthesized prefix is non-capturing so
        // the title stays in group 1 and the score in group 2, like the other
        // group patterns; capturing it shifted the title into the score slot.
        MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^(?:\(.+\))\s*(.+?)(?:\s*\((\d+)\s*分\))?$", 3));

        // e.g. "1. stem text"
        QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.\s*(.*)$", 1));

        // e.g. "A. option text". The label must be captured (group 1) so that the
        // option text lands in group 2.
        OptionPatterns.Add(new RegexPatternConfig(@"^([A-D])\.\s*(.*)$", 1));
    }
}
/// <summary>
/// One regex hit found while scanning the exam text, together with the
/// pattern that produced it and the kind of structure it represents.
/// </summary>
public class PotentialMatch
{
// Position in the original text where the match begins.
public int StartIndex { get; set; }
public int EndIndex { get; set; } // Position in the original text where the matched structure ends
public string MatchedText { get; set; } // The full matched line or passage
public Match RegexMatch { get; set; } // The original Regex Match object, kept for access to capture groups
public RegexPatternConfig PatternConfig { get; set; } // The pattern configuration that produced this match
public MatchType Type { get; set; } // Kind of structure: MajorQuestionGroup, Question, Option, etc.
}
/// <summary>
/// Kind of exam structure a <see cref="PotentialMatch"/> represents.
/// </summary>
public enum MatchType
{
MajorQuestionGroup,
Question,
Option,
Other // Reserved for any other types that may need to be recognized
}
/// <summary>
/// Scans raw exam text and collects every potential match (question groups,
/// questions, options). It only matches; it does not decide which structure
/// owns which match.
/// </summary>
public class ExamDocumentScanner
{
    private readonly ExamParserConfig _config;

    /// <summary>
    /// Creates a scanner that uses the patterns from <paramref name="config"/>.
    /// </summary>
    public ExamDocumentScanner(ExamParserConfig config)
    {
        _config = config;
    }

    /// <summary>
    /// Runs every configured pattern over <paramref name="text"/>.
    /// </summary>
    /// <param name="text">The text to scan.</param>
    /// <returns>All matches found, ordered by their start position.</returns>
    public List<PotentialMatch> Scan(string text)
    {
        var collected = new List<PotentialMatch>();

        // Same collection order as the pattern families are declared:
        // groups, then questions, then options.
        CollectMatches(collected, _config.MajorQuestionGroupPatterns, MatchType.MajorQuestionGroup, text);
        CollectMatches(collected, _config.QuestionPatterns, MatchType.Question, text);
        CollectMatches(collected, _config.OptionPatterns, MatchType.Option, text);

        return collected.OrderBy(hit => hit.StartIndex).ToList();
    }

    /// <summary>
    /// Appends one <see cref="PotentialMatch"/> of the given type to
    /// <paramref name="sink"/> for every hit of every pattern in <paramref name="patterns"/>.
    /// </summary>
    private static void CollectMatches(List<PotentialMatch> sink, List<RegexPatternConfig> patterns, MatchType type, string text)
    {
        foreach (var pattern in patterns)
        {
            foreach (Match hit in pattern.Regex.Matches(text))
            {
                var potential = new PotentialMatch();
                potential.StartIndex = hit.Index;
                potential.EndIndex = hit.Index + hit.Length;
                potential.MatchedText = hit.Value;
                potential.RegexMatch = hit;
                potential.PatternConfig = pattern;
                potential.Type = type;
                sink.Add(potential);
            }
        }
    }
}
public class ExamStructureBuilder
{
// Parser configuration handed in at construction time.
private readonly ExamParserConfig _config;

/// <summary>
/// Creates a builder bound to the given parser configuration.
/// </summary>
public ExamStructureBuilder(ExamParserConfig config) => _config = config;
/// <summary>
/// Builds the hierarchical <see cref="ExamPaper"/> from the raw text and the
/// matches found in it.
/// </summary>
/// <remarks>
/// Matches are processed in list order. Text lying between two matches is
/// attached to the active question, or otherwise to the active question group;
/// text with neither active is ignored. Nesting is decided by pattern priority:
/// a new group or question closes every open one whose priority number is
/// greater than or equal to its own.
/// </remarks>
/// <param name="fullExamText">The complete exam text the matches refer to.</param>
/// <param name="allPotentialMatches">Matches over <paramref name="fullExamText"/>, ordered by start position.</param>
/// <returns>The populated exam paper.</returns>
public ExamPaper BuildExamPaper(string fullExamText, List<PotentialMatch> allPotentialMatches)
{
    var examPaper = new ExamPaper();
    examPaper.Title = GetExamTitle(fullExamText);

    var majorQGStack = new Stack<MajorQuestionGroup>();
    MajorQuestionGroup currentMajorQG = null;
    var questionStack = new Stack<Question>();
    Question currentQuestion = null;
    int currentContentStart = 0;

    for (int i = 0; i < allPotentialMatches.Count; i++)
    {
        var pm = allPotentialMatches[i];

        // Different patterns can match the same line, so a match may start
        // before the end of the previous one. Only slice when there really is
        // text in between; a negative length would make Substring throw.
        if (pm.StartIndex > currentContentStart)
        {
            string precedingText = fullExamText.Substring(currentContentStart, pm.StartIndex - currentContentStart).Trim();
            AttachLooseText(currentQuestion, currentMajorQG, precedingText, allPotentialMatches, currentContentStart, pm.StartIndex);
        }

        if (pm.Type == MatchType.MajorQuestionGroup)
        {
            // A group of equal or higher priority (smaller or equal number)
            // ends the groups currently open on the stack.
            while (majorQGStack.Any() && pm.PatternConfig.Priority <= majorQGStack.Peek().Priority)
            {
                majorQGStack.Pop();
            }

            var newMajorQG = new MajorQuestionGroup
            {
                Title = pm.RegexMatch.Groups[1].Value.Trim(),
                // Group 2 is the optional score; a missing or non-numeric value yields 0.
                Score = ParseScore(pm.RegexMatch.Groups[2]),
                Priority = pm.PatternConfig.Priority
            };

            if (majorQGStack.Any())
            {
                majorQGStack.Peek().SubMajorQuestionGroups.Add(newMajorQG);
            }
            else
            {
                examPaper.MajorQuestionGroups.Add(newMajorQG);
            }

            majorQGStack.Push(newMajorQG);
            currentMajorQG = newMajorQG;

            // A new group starts a fresh question context.
            questionStack.Clear();
            currentQuestion = null;
        }
        else if (pm.Type == MatchType.Question)
        {
            // A question of equal or higher priority ends the open questions;
            // whatever remains on the stack becomes the parent.
            while (questionStack.Any() && pm.PatternConfig.Priority <= questionStack.Peek().Priority)
            {
                questionStack.Pop();
            }

            var newQuestion = new Question
            {
                Number = pm.RegexMatch.Groups[1].Value.Trim(),
                Text = pm.RegexMatch.Groups[2].Value.Trim(),
                // Group 2 is the stem, so a score can only come from a third
                // capture group, if the pattern defines one.
                Score = ParseScore(pm.RegexMatch.Groups[3]),
                Priority = pm.PatternConfig.Priority
            };

            if (questionStack.Any())
            {
                questionStack.Peek().SubQuestions.Add(newQuestion);
            }
            else if (currentMajorQG != null)
            {
                currentMajorQG.Questions.Add(newQuestion);
            }
            else
            {
                // No active group or question: keep it at the top level of the paper.
                examPaper.TopLevelQuestions.Add(newQuestion);
            }

            questionStack.Push(newQuestion);
            currentQuestion = newQuestion;
        }
        else if (pm.Type == MatchType.Option)
        {
            // An option has to belong to a question.
            if (currentQuestion != null)
            {
                currentQuestion.Options.Add(new Option
                {
                    Label = pm.RegexMatch.Groups[1].Value.Trim(),
                    Text = pm.RegexMatch.Groups[2].Value.Trim()
                });
            }
            else
            {
                Console.WriteLine($"Warning: Found isolated Option at index {pm.StartIndex}: {pm.MatchedText}");
            }
        }

        // Never move backwards: an overlapping match may end before the
        // position that has already been consumed.
        currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
    }

    // Whatever follows the last match belongs to the last active structure.
    if (currentContentStart < fullExamText.Length)
    {
        string remainingText = fullExamText.Substring(currentContentStart).Trim();
        AttachLooseText(currentQuestion, currentMajorQG, remainingText, allPotentialMatches, currentContentStart, fullExamText.Length);
    }

    return examPaper;
}

/// <summary>
/// Attaches free text to the active question, or failing that appends it to the
/// active group's description. Blank text, or text with no active owner, is dropped.
/// </summary>
private void AttachLooseText(Question currentQuestion, MajorQuestionGroup currentMajorQG, string text, List<PotentialMatch> allMatches, int rangeStart, int rangeEnd)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return;
    }

    if (currentQuestion != null)
    {
        ProcessQuestionContent(currentQuestion, text, GetSubMatchesForRange(allMatches, rangeStart, rangeEnd));
    }
    else if (currentMajorQG != null)
    {
        currentMajorQG.Descript += (string.IsNullOrWhiteSpace(currentMajorQG.Descript) ? "" : "\n") + text;
    }
}

/// <summary>
/// Reads a score from a capture group; returns 0 when the group did not
/// participate in the match or does not hold a number.
/// </summary>
private static float ParseScore(Group scoreGroup)
{
    if (!scoreGroup.Success)
    {
        return 0f;
    }

    return float.TryParse(
        scoreGroup.Value,
        System.Globalization.NumberStyles.Float,
        System.Globalization.CultureInfo.InvariantCulture,
        out float score)
        ? score
        : 0f;
}
/// <summary>
/// Extracts the exam title (simple implementation): the first line that is
/// not blank, with surrounding whitespace removed.
/// </summary>
/// <param name="examPaperText">The complete exam text; may be null or empty.</param>
/// <returns>The trimmed first non-blank line, or a placeholder title when there is none.</returns>
private string GetExamTitle(string examPaperText)
{
    const string fallbackTitle = "未识别试卷标题";

    // Null or blank input has no title line to offer.
    if (string.IsNullOrWhiteSpace(examPaperText))
    {
        return fallbackTitle;
    }

    foreach (var line in examPaperText.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries))
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            // Trim so the title is normalized like every other captured value.
            return line.Trim();
        }
    }

    return fallbackTitle;
}
/// <summary>
/// Returns the matches whose start position lies in the half-open range
/// [<paramref name="start"/>, <paramref name="end"/>), in their original order.
/// Used to hand <c>ProcessQuestionContent</c> the matches inside a span of text.
/// </summary>
/// <param name="allMatches">All matches; their positions are relative to the full exam text.</param>
/// <param name="start">Inclusive lower bound on the start position.</param>
/// <param name="end">Exclusive upper bound on the start position.</param>
private List<PotentialMatch> GetSubMatchesForRange(List<PotentialMatch> allMatches, int start, int end)
{
    var inRange = new List<PotentialMatch>();

    foreach (var candidate in allMatches)
    {
        bool outside = candidate.StartIndex < start || candidate.StartIndex >= end;
        if (outside)
        {
            continue;
        }

        inRange.Add(candidate);
    }

    return inRange;
}
/// <summary>
/// Processes a span of content belonging to a question: option matches inside
/// the span become <see cref="Option"/> entries, and the text around them is
/// appended to the question stem.
/// </summary>
/// <remarks>
/// Match positions refer to the full exam text, whereas
/// <paramref name="contentText"/> is an extracted (and possibly trimmed) slice
/// of it. Each option is therefore located inside the slice by searching for
/// its matched text, rather than by reusing the absolute offsets as indices.
/// </remarks>
/// <param name="question">The question that receives the text and options.</param>
/// <param name="contentText">The content span to process.</param>
/// <param name="potentialMatchesInScope">Matches that fall inside the span.</param>
private void ProcessQuestionContent(Question question, string contentText, List<PotentialMatch> potentialMatchesInScope)
{
    // Position in contentText up to which everything has been consumed.
    int cursor = 0;

    foreach (var pm in potentialMatchesInScope.OrderBy(p => p.StartIndex))
    {
        if (pm.Type != MatchType.Option)
        {
            // TODO: sub-questions (question matches with a lower priority than
            // this question) could be handled here in a similar way.
            continue;
        }

        // Trimmed because contentText itself may have been trimmed by the caller.
        string optionLine = (pm.MatchedText ?? string.Empty).Trim();
        int localStart = optionLine.Length > 0
            ? contentText.IndexOf(optionLine, cursor, StringComparison.Ordinal)
            : -1;

        if (localStart >= 0)
        {
            // Text between the previous option and this one continues the stem.
            AppendToQuestionText(question, contentText.Substring(cursor, localStart - cursor));
            cursor = localStart + optionLine.Length;
        }

        question.Options.Add(new Option
        {
            Label = pm.RegexMatch.Groups[1].Value.Trim(),
            Text = pm.RegexMatch.Groups[2].Value.Trim()
        });
    }

    // Whatever is left after the last option also belongs to the stem.
    if (cursor < contentText.Length)
    {
        AppendToQuestionText(question, contentText.Substring(cursor));
    }
}

/// <summary>
/// Appends non-blank text to the question stem, separated from any existing
/// stem text by a newline.
/// </summary>
private static void AppendToQuestionText(Question question, string text)
{
    string trimmed = text.Trim();
    if (string.IsNullOrWhiteSpace(trimmed))
    {
        return;
    }

    question.Text += (string.IsNullOrWhiteSpace(question.Text) ? "" : "\n") + trimmed;
}
}
}