// TechHelper/TechHelper.Client/Exam/ExamParse.cs

using System.Text.RegularExpressions;

namespace TechHelper.Client.Exam.Parse
{
	/// <summary>
	/// Root object of a parsed exam: header information plus the question
	/// groups and the questions that were found outside of any group.
	/// </summary>
	public class ExamPaper
	{
		/// <summary>Exam title. Starts as a placeholder meaning "exam title not recognized".</summary>
		public string Title { get; set; } = "未识别试卷标题";

		/// <summary>Exam description. Starts as a placeholder meaning "exam description not recognized".</summary>
		public string Descript { get; set; } = "未识别试卷描述";

		/// <summary>Subject area. Starts as a placeholder meaning "exam category".</summary>
		public string SubjectArea { get; set; } = "试卷类别";

		/// <summary>Top-level question groups, in the order they were added.</summary>
		public List<MajorQuestionGroup> MajorQuestionGroups { get; set; } = new();

		/// <summary>Questions that are not attached to any question group.</summary>
		public List<Question> TopLevelQuestions { get; set; } = new();
	}

	/// <summary>
	/// A titled section of an exam. A group can hold nested groups as well as
	/// questions of its own.
	/// </summary>
	public class MajorQuestionGroup
	{
		/// <summary>Heading text of the group. Empty by default.</summary>
		public string Title { get; set; } = "";

		/// <summary>Free-form description attached to the group. Empty by default.</summary>
		public string Descript { get; set; } = "";

		/// <summary>Score assigned to the group; 0 when none was set.</summary>
		public float Score { get; set; }

		/// <summary>Groups nested directly below this one.</summary>
		public List<MajorQuestionGroup> SubMajorQuestionGroups { get; set; } = new();

		/// <summary>Questions that belong directly to this group.</summary>
		public List<Question> Questions { get; set; } = new();

		/// <summary>Priority value associated with this group (an integer; 0 by default).</summary>
		public int Priority { get; set; }
	}

	/// <summary>
	/// A single exam question with its number, stem text, answer options and
	/// any nested sub-questions.
	/// </summary>
	public class Question
	{
		/// <summary>Question number as it appeared in the text. Empty by default.</summary>
		public string Number { get; set; } = "";

		/// <summary>Question stem. Empty by default.</summary>
		public string Text { get; set; } = "";

		/// <summary>Score assigned to the question; 0 when none was set.</summary>
		public float Score { get; set; }

		/// <summary>Answer options of the question.</summary>
		public List<Option> Options { get; set; } = new();

		/// <summary>Questions nested directly below this one.</summary>
		public List<Question> SubQuestions { get; set; } = new();

		/// <summary>Priority value associated with this question (an integer; 0 by default).</summary>
		public int Priority { get; set; }
	}

	/// <summary>One answer option of a question: a label and the option text.</summary>
	public class Option
	{
		/// <summary>Option label. Empty by default.</summary>
		public string Label { get; set; } = "";

		/// <summary>Option text. Empty by default.</summary>
		public string Text { get; set; } = "";
	}


	/// <summary>
	/// A regular expression paired with a priority.
	/// </summary>
	/// <remarks>
	/// <see cref="Regex"/> is always the compiled form of <see cref="Pattern"/>:
	/// assigning a new pattern recompiles it, so the two can never drift apart.
	/// </remarks>
	public class RegexPatternConfig
	{
		// Multiline so that ^ and $ anchor at line boundaries; Compiled to speed up repeated matching.
		private const RegexOptions PatternOptions = RegexOptions.Multiline | RegexOptions.Compiled;

		private string _pattern;

		/// <summary>
		/// The regular expression source text. Setting it recompiles <see cref="Regex"/>.
		/// </summary>
		/// <exception cref="ArgumentNullException">The assigned value is null.</exception>
		/// <exception cref="ArgumentException">The assigned value is not a valid regular expression.</exception>
		public string Pattern
		{
			get => _pattern;
			set
			{
				// Compile first: if the pattern is invalid the exception leaves the
				// previous Pattern/Regex pair untouched.
				Regex = new Regex(value, PatternOptions);
				_pattern = value;
			}
		}

		/// <summary>Priority of the pattern; a smaller number means a higher priority.</summary>
		public int Priority { get; set; }

		/// <summary>Compiled <see cref="System.Text.RegularExpressions.Regex"/> built from <see cref="Pattern"/>.</summary>
		public Regex Regex { get; private set; }

		/// <summary>Creates a configuration and compiles <paramref name="pattern"/>.</summary>
		/// <param name="pattern">Regular expression source text.</param>
		/// <param name="priority">Priority of the pattern; a smaller number means a higher priority.</param>
		public RegexPatternConfig(string pattern, int priority)
		{
			Regex = new Regex(pattern, PatternOptions);
			_pattern = pattern;
			Priority = priority;
		}
	}

	/// <summary>
	/// Regular-expression configuration for exam parsing: the patterns that
	/// recognise question-group headings, questions and answer options.
	/// </summary>
	/// <remarks>
	/// Capture-group layout of the default patterns (this is the layout
	/// <c>ExamStructureBuilder</c> reads by index):
	/// <list type="bullet">
	/// <item><description>Group heading: group 1 = title, group 2 = optional score.</description></item>
	/// <item><description>Question: group 1 = number, group 2 = text.</description></item>
	/// <item><description>Option: group 1 = label, group 2 = text.</description></item>
	/// </list>
	/// </remarks>
	public class ExamParserConfig
	{
		/// <summary>Patterns that recognise question-group headings.</summary>
		public List<RegexPatternConfig> MajorQuestionGroupPatterns { get; set; } = new List<RegexPatternConfig>();

		/// <summary>Patterns that recognise questions.</summary>
		public List<RegexPatternConfig> QuestionPatterns { get; set; } = new List<RegexPatternConfig>();

		/// <summary>Patterns that recognise answer options.</summary>
		public List<RegexPatternConfig> OptionPatterns { get; set; } = new List<RegexPatternConfig>();

		/// <summary>Populates the three pattern lists with the default patterns.</summary>
		public ExamParserConfig()
		{
			// In Multiline mode "$" matches only before '\n', so on CRLF text the optional
			// score group could never be followed by "$". The trailing "\r?" lets the
			// score be recognised with either line-ending style.

			// e.g. "一、选择题 (5分)"
			MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^[一二三四五六七八九十]+\s*[、.]\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 1));

			// e.g. "1. 填空题 (10分)"
			// NOTE(review): every line matched by the question pattern below also matches this
			// pattern, so numbered lines are reported both as a group and as a question — confirm
			// that this is intended.
			MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^\d+\.\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 2));

			// e.g. "(一) 文言文阅读 (8分)"
			// The leading "(…)" marker is a non-capturing group so that group 1 stays the title
			// and group 2 the score, like in the two patterns above. It is limited to a single
			// parenthesised token so it cannot swallow a trailing "(8分)".
			MajorQuestionGroupPatterns.Add(new RegexPatternConfig(@"^(?:\([^()\r\n]+\))\s*(.+?)(?:\s*\((\d+)\s*分\))?\r?$", 3));

			// e.g. "1. 题干" — group 1 = number, group 2 = text
			QuestionPatterns.Add(new RegexPatternConfig(@"^(\d+)\.\s*(.*)$", 1));

			// e.g. "A. 选项内容" — group 1 = label, group 2 = text
			OptionPatterns.Add(new RegexPatternConfig(@"^([A-D])\.\s*(.*)$", 1));
		}
	}


	/// <summary>
	/// One regex hit found in the exam text, together with the pattern that
	/// produced it and the kind of element it stands for.
	/// </summary>
	public class PotentialMatch
	{
		/// <summary>Kind of element this match represents (group, question, option, ...).</summary>
		public MatchType Type { get; set; }

		/// <summary>Pattern configuration that produced the match.</summary>
		public RegexPatternConfig PatternConfig { get; set; }

		/// <summary>The underlying <see cref="Match"/>, kept so capture groups can be read later.</summary>
		public Match RegexMatch { get; set; }

		/// <summary>The complete matched text.</summary>
		public string MatchedText { get; set; }

		/// <summary>Start position of the match.</summary>
		public int StartIndex { get; set; }

		/// <summary>Position in the original text where the matched structure ends.</summary>
		public int EndIndex { get; set; }
	}

	/// <summary>Kinds of structural elements a match can represent.</summary>
	public enum MatchType
	{
		/// <summary>A question-group heading.</summary>
		MajorQuestionGroup = 0,

		/// <summary>A question.</summary>
		Question = 1,

		/// <summary>An answer option.</summary>
		Option = 2,

		/// <summary>Reserved for any other kind that may need to be recognised.</summary>
		Other = 3
	}


	/// <summary>
	/// Runs every configured pattern over the raw text and collects the hits
	/// (group headings, questions, options). It only records matches; it does
	/// not decide which element belongs to which.
	/// </summary>
	public class ExamDocumentScanner
	{
		private readonly ExamParserConfig _config;

		/// <summary>Creates a scanner that uses the patterns in <paramref name="config"/>.</summary>
		public ExamDocumentScanner(ExamParserConfig config) => _config = config;

		/// <summary>
		/// Scans <paramref name="text"/> with all group, question and option patterns.
		/// </summary>
		/// <param name="text">The text to scan.</param>
		/// <returns>
		/// Every match found, ordered by start position. The sort is stable, so hits with
		/// the same start position stay in collection order: group patterns first, then
		/// question patterns, then option patterns.
		/// </returns>
		public List<PotentialMatch> Scan(string text)
		{
			var hits = new List<PotentialMatch>();

			Collect(hits, _config.MajorQuestionGroupPatterns, MatchType.MajorQuestionGroup, text);
			Collect(hits, _config.QuestionPatterns, MatchType.Question, text);
			Collect(hits, _config.OptionPatterns, MatchType.Option, text);

			return hits.OrderBy(hit => hit.StartIndex).ToList();
		}

		/// <summary>
		/// Appends to <paramref name="sink"/> one <see cref="PotentialMatch"/> of kind
		/// <paramref name="type"/> for every hit of every pattern in <paramref name="patterns"/>.
		/// </summary>
		private static void Collect(List<PotentialMatch> sink, List<RegexPatternConfig> patterns, MatchType type, string text)
		{
			foreach (var patternConfig in patterns)
			{
				foreach (Match hit in patternConfig.Regex.Matches(text))
				{
					var potential = new PotentialMatch();
					potential.StartIndex = hit.Index;
					potential.EndIndex = hit.Index + hit.Length;
					potential.MatchedText = hit.Value;
					potential.RegexMatch = hit;
					potential.PatternConfig = patternConfig;
					potential.Type = type;
					sink.Add(potential);
				}
			}
		}
	}

public class ExamStructureBuilder
	{
		// Configuration supplied by the caller. It is stored here but not read by any
		// other member of this class as currently written.
		private readonly ExamParserConfig _config;

		/// <summary>Creates a builder and keeps a reference to <paramref name="config"/>.</summary>
		/// <param name="config">Parser configuration to associate with this builder.</param>
		public ExamStructureBuilder(ExamParserConfig config) => _config = config;

		/// <summary>
		/// Builds the hierarchical <see cref="ExamPaper"/> from the raw text and the
		/// matches found in it.
		/// </summary>
		/// <remarks>
		/// Matches are processed in list order. Capture groups are read by index:
		/// group heading — 1 = title, 2 = score; question — 1 = number, 2 = text,
		/// 3 = score (if the pattern defines it); option — 1 = label, 2 = text.
		/// Text between two matches is attached to the current question if there is one,
		/// otherwise to the current group's description, otherwise it is ignored.
		/// Matches may overlap (several patterns can hit the same line); overlapping
		/// matches are all processed, but already-consumed text is never read twice.
		/// </remarks>
		/// <param name="fullExamText">The complete exam text the matches refer to.</param>
		/// <param name="allPotentialMatches">Matches over <paramref name="fullExamText"/>, ordered by start position.</param>
		/// <returns>The populated exam paper.</returns>
		/// <exception cref="ArgumentNullException">An argument is null.</exception>
		public ExamPaper BuildExamPaper(string fullExamText, List<PotentialMatch> allPotentialMatches)
		{
			if (fullExamText == null)
			{
				throw new ArgumentNullException(nameof(fullExamText));
			}
			if (allPotentialMatches == null)
			{
				throw new ArgumentNullException(nameof(allPotentialMatches));
			}

			var examPaper = new ExamPaper();
			examPaper.Title = GetExamTitle(fullExamText);

			var majorQGStack = new Stack<MajorQuestionGroup>();
			MajorQuestionGroup currentMajorQG = null;

			var questionStack = new Stack<Question>();
			Question currentQuestion = null;

			// Offset in fullExamText up to which text has already been consumed.
			int currentContentStart = 0;

			foreach (var pm in allPotentialMatches)
			{
				// Only read the gap when this match starts after the consumed region.
				// Overlapping matches start inside it, and a negative Substring length
				// would throw.
				if (pm.StartIndex > currentContentStart)
				{
					AttachLooseText(fullExamText, allPotentialMatches, currentContentStart, pm.StartIndex,
						currentQuestion, currentMajorQG);
				}

				switch (pm.Type)
				{
					case MatchType.MajorQuestionGroup:
					{
						// A heading whose priority number is equal to or smaller than the one
						// on top of the stack closes that group.
						while (majorQGStack.Count > 0 && pm.PatternConfig.Priority <= majorQGStack.Peek().Priority)
						{
							majorQGStack.Pop();
						}

						var newMajorQG = new MajorQuestionGroup
						{
							Title = pm.RegexMatch.Groups[1].Value.Trim(),
							Score = ParseScore(pm.RegexMatch.Groups[2]),
							Priority = pm.PatternConfig.Priority
						};

						if (majorQGStack.Count > 0)
						{
							majorQGStack.Peek().SubMajorQuestionGroups.Add(newMajorQG);
						}
						else
						{
							examPaper.MajorQuestionGroups.Add(newMajorQG);
						}

						majorQGStack.Push(newMajorQG);
						currentMajorQG = newMajorQG;

						// A new group ends whatever question was open.
						questionStack.Clear();
						currentQuestion = null;
						break;
					}

					case MatchType.Question:
					{
						// Same closing rule as for groups, applied to the question stack.
						while (questionStack.Count > 0 && pm.PatternConfig.Priority <= questionStack.Peek().Priority)
						{
							questionStack.Pop();
						}

						var newQuestion = new Question
						{
							Number = pm.RegexMatch.Groups[1].Value.Trim(),
							Text = pm.RegexMatch.Groups[2].Value.Trim(),
							// Group 2 is the question text, so a score can only come from a
							// third capture group; it stays 0 when the pattern has none.
							Score = ParseScore(pm.RegexMatch.Groups[3]),
							Priority = pm.PatternConfig.Priority
						};

						if (questionStack.Count > 0)
						{
							questionStack.Peek().SubQuestions.Add(newQuestion);
						}
						else if (currentMajorQG != null)
						{
							currentMajorQG.Questions.Add(newQuestion);
						}
						else
						{
							examPaper.TopLevelQuestions.Add(newQuestion);
						}

						questionStack.Push(newQuestion);
						currentQuestion = newQuestion;
						break;
					}

					case MatchType.Option:
					{
						// An option must belong to a question.
						if (currentQuestion != null)
						{
							currentQuestion.Options.Add(new Option
							{
								Label = pm.RegexMatch.Groups[1].Value.Trim(),
								Text = pm.RegexMatch.Groups[2].Value.Trim()
							});
						}
						else
						{
							Console.WriteLine($"Warning: Found isolated Option at index {pm.StartIndex}: {pm.MatchedText}");
						}
						break;
					}
				}

				// Never move backwards: an overlapping match may end before the
				// previous one did.
				currentContentStart = Math.Max(currentContentStart, pm.EndIndex);
			}

			// Text after the last match.
			if (currentContentStart < fullExamText.Length)
			{
				AttachLooseText(fullExamText, allPotentialMatches, currentContentStart, fullExamText.Length,
					currentQuestion, currentMajorQG);
			}

			return examPaper;
		}

		/// <summary>
		/// Attaches the trimmed text in [<paramref name="start"/>, <paramref name="end"/>) to the
		/// open question, or else to the open group's description; does nothing when the
		/// text is blank or when neither is open.
		/// </summary>
		private void AttachLooseText(string fullExamText, List<PotentialMatch> allMatches, int start, int end,
			Question openQuestion, MajorQuestionGroup openGroup)
		{
			string looseText = fullExamText.Substring(start, end - start).Trim();
			if (string.IsNullOrWhiteSpace(looseText))
			{
				return;
			}

			if (openQuestion != null)
			{
				ProcessQuestionContent(openQuestion, looseText, GetSubMatchesForRange(allMatches, start, end));
			}
			else if (openGroup != null)
			{
				openGroup.Descript += (string.IsNullOrWhiteSpace(openGroup.Descript) ? "" : "\n") + looseText;
			}
			// Otherwise: text outside any group or question is ignored for now.
		}

		/// <summary>
		/// Converts a captured score to a number; returns 0 when the group did not
		/// participate in the match or its text is not a valid number.
		/// </summary>
		private static float ParseScore(Group scoreGroup)
		{
			if (!scoreGroup.Success)
			{
				return 0f;
			}

			// Invariant culture: the score comes from document text, not from user-locale input.
			return float.TryParse(
				scoreGroup.Value,
				System.Globalization.NumberStyles.Float,
				System.Globalization.CultureInfo.InvariantCulture,
				out float score)
				? score
				: 0f;
		}

		/// <summary>
		/// Picks the exam title: the first line of the text that is not blank.
		/// </summary>
		/// <param name="examPaperText">The complete exam text.</param>
		/// <returns>
		/// The first non-blank line exactly as it appears (not trimmed), or the
		/// placeholder title when the text has no such line.
		/// </returns>
		private string GetExamTitle(string examPaperText)
		{
			char[] lineBreaks = { '\r', '\n' };

			foreach (string candidate in examPaperText.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries))
			{
				if (!string.IsNullOrWhiteSpace(candidate))
				{
					return candidate;
				}
			}

			return "未识别试卷标题";
		}

		/// <summary>
		/// Returns the matches whose start position lies in the half-open range
		/// [<paramref name="start"/>, <paramref name="end"/>).
		/// </summary>
		/// <param name="allMatches">All matches to filter.</param>
		/// <param name="start">Inclusive lower bound for <c>StartIndex</c>.</param>
		/// <param name="end">Exclusive upper bound for <c>StartIndex</c>.</param>
		/// <returns>A new list with the matches in range, in their original order.</returns>
		private List<PotentialMatch> GetSubMatchesForRange(List<PotentialMatch> allMatches, int start, int end)
		{
			// StartIndex is compared directly against start/end, so both bounds must be
			// expressed in the same coordinate space as the matches' StartIndex values.
			var startingInRange =
				from candidate in allMatches
				where candidate.StartIndex >= start && candidate.StartIndex < end
				select candidate;

			return startingInRange.ToList();
		}

		/// <summary>
		/// Adds the content that follows a question to that question: option matches
		/// become <see cref="Option"/> entries and all remaining text is appended to
		/// the question text.
		/// </summary>
		/// <remarks>
		/// <paramref name="contentText"/> is a trimmed slice of the exam text, whereas the
		/// <c>StartIndex</c>/<c>EndIndex</c> of a match are offsets into the whole exam text.
		/// Those offsets therefore cannot be used to slice <paramref name="contentText"/>;
		/// each option is instead located by searching for its matched text.
		/// </remarks>
		/// <param name="question">The question that receives the options and text.</param>
		/// <param name="contentText">Trimmed text belonging to the question.</param>
		/// <param name="potentialMatchesInScope">Matches that start inside the slice <paramref name="contentText"/> was taken from.</param>
		private void ProcessQuestionContent(Question question, string contentText, List<PotentialMatch> potentialMatchesInScope)
		{
			// Position inside contentText up to which text has been handled.
			int cursor = 0;

			foreach (var pm in potentialMatchesInScope.OrderBy(p => p.StartIndex))
			{
				if (pm.Type != MatchType.Option)
				{
					// TODO: sub-questions (question matches with a larger priority number
					// than this question) could be handled here as well.
					continue;
				}

				// contentText was trimmed, so compare against the trimmed match text.
				string optionLine = (pm.MatchedText ?? string.Empty).Trim();
				int optionStart = optionLine.Length == 0
					? -1
					: contentText.IndexOf(optionLine, cursor, StringComparison.Ordinal);

				if (optionStart >= 0)
				{
					// Text in front of the option continues the question stem.
					AppendQuestionText(question, contentText.Substring(cursor, optionStart - cursor));
					cursor = optionStart + optionLine.Length;
				}
				// If the option text cannot be located, the option is still recorded but
				// the cursor stays where it is.

				question.Options.Add(new Option
				{
					Label = pm.RegexMatch.Groups[1].Value.Trim(),
					Text = pm.RegexMatch.Groups[2].Value.Trim()
				});
			}

			// Whatever is left after the last option.
			if (cursor < contentText.Length)
			{
				AppendQuestionText(question, contentText.Substring(cursor));
			}
		}

		/// <summary>
		/// Appends the trimmed <paramref name="fragment"/> to the question text, separated
		/// by a newline when the question already has text; blank fragments are skipped.
		/// </summary>
		private static void AppendQuestionText(Question question, string fragment)
		{
			string trimmed = fragment.Trim();
			if (string.IsNullOrWhiteSpace(trimmed))
			{
				return;
			}

			question.Text += (string.IsNullOrWhiteSpace(question.Text) ? "" : "\n") + trimmed;
		}
	}

}