|
400 | 400 | "class InlineMath(Markdown):\n", |
401 | 401 | " def __init__(self, content):\n", |
402 | 402 | " super().__init__(content)\n", |
| 403 | + " self.delimiter_size = 2\n", |
403 | 404 | "\n", |
404 | 405 | " def __str__(self):\n", |
405 | 406 | " return f\"InlineMath({self.content!r})\"\n", |
|
411 | 412 | "class DisplayMath(Markdown): \n", |
412 | 413 | " def __init__(self, content):\n", |
413 | 414 | " super().__init__(content)\n", |
| 415 | + " self.delimiter_size = 4\n", |
414 | 416 | "\n", |
415 | 417 | " def __str__(self):\n", |
416 | 418 | " return f\"DisplayMath({self.content!r})\"\n", |
|
1231 | 1233 | "# but overall the position should be fairly accurate.\n", |
1232 | 1234 | "\n", |
1233 | 1235 | "def improve_trim(text: str, start: int, end: int) -> str:\n", |
1234 | | - " markdown_classes = convert_markdown_to_classes_by_lines(text)\n", |
1235 | | - " index = 0\n", |
| 1236 | + " markdown_classes = convert_markdown_to_classes(text)\n", |
| 1237 | + " # print(markdown_classes)\n", |
| 1238 | + " text_index = 0\n", |
| 1239 | + " class_index = 0\n", |
| 1240 | + " improved_start = -1\n", |
| 1241 | + " improved_end = -1\n", |
1236 | 1242 | "\n", |
1237 | | - " for i in range(len(markdown_classes)):\n", |
1238 | | - " structure = markdown_classes[i]\n", |
| 1243 | + " while class_index < len(markdown_classes):\n", |
| 1244 | + " structure = markdown_classes[class_index]\n", |
1239 | 1245 | "\n", |
1240 | 1246 | " match structure:\n", |
1241 | 1247 | " case RegularText():\n", |
1242 | | - " if len(structure.content) + index < start:\n", |
1243 | | - " # start is not in this structure\n", |
1244 | | - " index += len(structure.content) + 1\n", |
1245 | | - " continue\n", |
1246 | | - " else:\n", |
1247 | | - " # start is in this structure\n", |
1248 | | - " structure_length = len(structure.content)\n", |
1249 | | - " structure.content = structure.content[start - index:]\n", |
1250 | | - " index += structure_length + 1\n", |
1251 | | - " continue\n", |
1252 | | - " return \"\"\n", |
| 1248 | + " structure_length = len(structure.content)\n", |
| 1249 | + " temp_improved_start = 0\n", |
| 1250 | + " temp_improved_end = len(structure.content)\n", |
| 1251 | + " if text_index <= start and structure_length + text_index > start:\n", |
| 1252 | + " improved_start = class_index\n", |
| 1253 | + " temp_improved_start = start - text_index\n", |
| 1254 | + "\n", |
| 1255 | + " if text_index <= end and structure_length + text_index >= end:\n", |
| 1256 | + " improved_end = class_index\n", |
| 1257 | + " temp_improved_end = end - text_index\n", |
| 1258 | + " \n", |
| 1259 | + " structure.content = structure.content[temp_improved_start:temp_improved_end]\n", |
| 1260 | + " text_index += structure_length\n", |
| 1261 | + "\n", |
| 1262 | + " case InlineMath() | DisplayMath():\n", |
| 1263 | + " structure_length = len(structure.content) + structure.delimiter_size\n", |
| 1264 | + " if text_index <= start and structure_length + text_index > start:\n", |
| 1265 | + " improved_start = class_index\n", |
| 1266 | + " if text_index <= end and structure_length + text_index >= end:\n", |
| 1267 | + " improved_end = class_index\n", |
| 1268 | + " text_index += structure_length\n", |
| 1269 | + " \n", |
| 1270 | + " class_index += 1\n", |
| 1271 | + "\n", |
| 1272 | + "\n", |
| 1273 | + " ret = markdown_classes[improved_start:improved_end + 1]\n", |
| 1274 | + " # print(ret)\n", |
| 1275 | + " # print(len(text), start, end)\n", |
| 1276 | + " # print(improved_start, improved_end)\n", |
| 1277 | + "\n", |
| 1278 | + " return convert_classes_to_markdown(ret)\n", |
1253 | 1279 | "\n" |
1254 | 1280 | ] |
1255 | 1281 | }, |
|
1337 | 1363 | " \"\"\"\n", |
1338 | 1364 | "\n", |
1339 | 1365 | "\n", |
1340 | | - "def trim_question(question: Set_Question_With_Solution) -> Set_Question_With_Solution:\n", |
| 1366 | + "def trim_question(question: tuple[int, Set_Question_With_Solution]) -> Set_Question_With_Solution:\n", |
| 1367 | + " question_number, question = question\n", |
| 1368 | + " question_number += 1\n", |
1341 | 1369 | "\n", |
1342 | 1370 | " def trim_question_content(content_text: str) -> str:\n", |
| 1371 | + "\n", |
1343 | 1372 | " if content_text == \"\":\n", |
1344 | 1373 | " return content_text\n", |
1345 | 1374 | "\n", |
|
1368 | 1397 | " try:\n", |
1369 | 1398 | " parsed_output = content_parser.parse(response.content)\n", |
1370 | 1399 | " start = content_text.index(parsed_output.start)\n", |
1371 | | - " end = content_text.index(parsed_output.end)\n", |
1372 | | - " print(\"Successfully trimmed the stem.\")\n", |
| 1400 | + " end = content_text.index(parsed_output.end) + len(parsed_output.end)\n", |
| 1401 | + " print(f\"Successfully trimmed the stem of question {question_number}.\")\n", |
1373 | 1402 | "\n", |
1374 | | - " return content_text[start:end + len(parsed_output.end) + 1].strip()\n", |
| 1403 | + " return improve_trim(content_text, start, end)\n", |
1375 | 1404 | " except Exception as e:\n", |
1376 | | - " print(f\"Error parsing LLM response as JSON for trimming content:\")\n", |
| 1405 | + " print(f\"Error parsing LLM response as JSON for trimming content of question {question_number}:\")\n", |
1377 | 1406 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
1378 | 1407 | " time.sleep(2)\n", |
1379 | 1408 | " else:\n", |
1380 | 1409 | " print(\"Final LLM Response:\")\n", |
1381 | 1410 | " print(response.content)\n", |
1382 | 1411 | " raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming content.\")\n", |
1383 | 1412 | "\n", |
1384 | | - " def trim_question_part(part_text: str) -> str:\n", |
| 1413 | + " def trim_question_part(part: tuple[int, str]) -> str:\n", |
| 1414 | + " part_number, part_text = part\n", |
| 1415 | + " part_number += 1\n", |
1385 | 1416 | " if part_text == \"\":\n", |
1386 | 1417 | " return part_text\n", |
1387 | 1418 | " \n", |
|
1410 | 1441 | " try:\n", |
1411 | 1442 | " parsed_output = part_parser.parse(response.content)\n", |
1412 | 1443 | " start = part_text.index(parsed_output.start)\n", |
1413 | | - " end = part_text.index(parsed_output.end)\n", |
1414 | | - " print(\"Successfully trimmed part\")\n", |
| 1444 | + " end = part_text.index(parsed_output.end) + len(parsed_output.end)\n", |
| 1445 | + " print(f\"Successfully trimmed part of question {question_number}, part {part_number}.\")\n", |
1415 | 1446 | "\n", |
1416 | | - " return part_text[start:end + len(parsed_output.end) + 1].strip()\n", |
| 1447 | + " return improve_trim(part_text, start, end)\n", |
1417 | 1448 | " except Exception as e:\n", |
1418 | | - " print(f\"Error parsing LLM response as JSON for trimming part:\")\n", |
| 1449 | + " print(f\"Error parsing LLM response as JSON for trimming part for question {question_number}, part {part_number}\")\n", |
1419 | 1450 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
1420 | 1451 | " time.sleep(2)\n", |
1421 | 1452 | " else:\n", |
1422 | 1453 | " print(\"Final LLM Response:\")\n", |
1423 | 1454 | " print(response.content)\n", |
1424 | 1455 | " raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming part.\")\n", |
1425 | 1456 | "\n", |
1426 | | - " def trim_question_part_solution(solution_text: str) -> str:\n", |
| 1457 | + " def trim_question_part_solution(solution: tuple[int, str]) -> str:\n", |
| 1458 | + " part_number, solution_text = solution\n", |
| 1459 | + " part_number += 1\n", |
1427 | 1460 | " if solution_text == \"\":\n", |
1428 | 1461 | " return solution_text\n", |
1429 | 1462 | " \n", |
|
1452 | 1485 | " try:\n", |
1453 | 1486 | " parsed_output = solution_parser.parse(response.content)\n", |
1454 | 1487 | " start = solution_text.index(parsed_output.start)\n", |
1455 | | - " end = solution_text.index(parsed_output.end)\n", |
1456 | | - " print(\"Successfully trimmed part-solution.\")\n", |
| 1488 | + " end = solution_text.index(parsed_output.end) + len(parsed_output.end)\n", |
| 1489 | + " print(f\"Successfully trimmed part-solution for question {question_number}, part {part_number}.\")\n", |
1457 | 1490 | "\n", |
1458 | | - " return solution_text[start:end + len(parsed_output.end) + 1].strip()\n", |
| 1491 | + " return improve_trim(solution_text, start, end)\n", |
1459 | 1492 | " except Exception as e:\n", |
1460 | | - " print(f\"Error parsing LLM response as JSON for trimming solution part:\")\n", |
| 1493 | + " print(f\"Error parsing LLM response as JSON for trimming solution part for question {question_number}, part {part_number}\")\n", |
1461 | 1494 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
1462 | 1495 | " time.sleep(2)\n", |
1463 | 1496 | "\n", |
|
1469 | 1502 | " question.content = trim_question_content(question.content)\n", |
1470 | 1503 | "\n", |
1471 | 1504 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", |
1472 | | - " question.parts = list(executor.map(trim_question_part, question.parts))\n", |
1473 | | - " question.parts_solutions = list(executor.map(trim_question_part_solution, question.parts_solutions))\n", |
| 1505 | + " question.parts = list(executor.map(trim_question_part, enumerate(question.parts)))\n", |
| 1506 | + " question.parts_solutions = list(executor.map(trim_question_part_solution, enumerate(question.parts_solutions)))\n", |
1474 | 1507 | "\n", |
1475 | 1508 | " return question\n", |
1476 | 1509 | "\n", |
1477 | 1510 | "def trim_text(set_questions: Set_Lines) -> Set_Lines:\n", |
1478 | 1511 | "\n", |
1479 | 1512 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", |
1480 | | - " set_questions.questions = list(executor.map(trim_question, set_questions.questions))\n", |
| 1513 | + " set_questions.questions = list(executor.map(trim_question, enumerate(set_questions.questions)))\n", |
1481 | 1514 | "\n", |
1482 | 1515 | " return set_questions\n" |
1483 | 1516 | ] |
|
0 commit comments