diff --git a/examples/rephrase/rephrase_style_controlled/README.md b/examples/rephrase/rephrase_style_controlled/README.md new file mode 100644 index 00000000..91f815b7 --- /dev/null +++ b/examples/rephrase/rephrase_style_controlled/README.md @@ -0,0 +1 @@ +# Rephrase with Style Control diff --git a/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh b/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh new file mode 100644 index 00000000..38b2f87c --- /dev/null +++ b/examples/rephrase/rephrase_style_controlled/rephrase_style_controlled.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml diff --git a/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml b/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml new file mode 100644 index 00000000..b7b1fb13 --- /dev/null +++ b/examples/rephrase/rephrase_style_controlled/style_controlled_rephrasing_config.yaml @@ -0,0 +1,36 @@ +global_params: + working_dir: cache + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/json_demo.json + + - id: chunk + op_name: chunk + type: map_batch + dependencies: + - read + execution_params: + replicas: 4 + params: + chunk_size: 2048 # larger chunk size for better context + chunk_overlap: 200 + + - id: rephrase + op_name: rephrase + type: map_batch + dependencies: + - chunk + execution_params: + replicas: 1 + batch_size: 128 + save_output: true + params: + method: style_controlled + style: critical_analysis diff --git a/graphgen/bases/__init__.py b/graphgen/bases/__init__.py index 0727b3fa..8ac5117c 100644 --- a/graphgen/bases/__init__.py +++ b/graphgen/bases/__init__.py @@ -1,3 +1,4 @@ +from .base_evaluator import BaseEvaluator from .base_extractor 
class BaseRephraser(ABC):
    """
    Abstract base class for rephrasers that rewrite a text with an LLM.

    Subclasses provide ``build_prompt`` (text -> prompt) and
    ``parse_response`` (raw LLM output -> result); ``rephrase`` chains the
    two around a call to ``llm_client.generate_answer``.
    """

    def __init__(self, llm_client: "BaseLLMWrapper"):
        """Store the LLM client used by ``rephrase``."""
        self.llm_client = llm_client

    @abstractmethod
    def build_prompt(self, text: str) -> str:
        """Build the prompt sent to the LLM for the given text."""

    @staticmethod
    @abstractmethod
    def parse_response(response: str) -> Any:
        """Parse the raw LLM response and return the rephrased result."""

    async def rephrase(
        self,
        item: dict,
    ) -> Any:
        """
        Rephrase the text stored under ``item["content"]``.

        Args:
            item: record holding the source text under the ``"content"`` key.

        Returns:
            Whatever ``parse_response`` returns for the LLM answer, or
            ``None`` when the content is not a string or is blank.

        Raises:
            KeyError: if ``item`` has no ``"content"`` key.
        """
        text = item["content"]
        # Nothing to rephrase: skip the LLM call rather than prompting it
        # with an empty (or non-textual) body. Callers can filter out None.
        if not isinstance(text, str) or not text.strip():
            return None
        prompt = self.build_prompt(text)
        response = await self.llm_client.generate_answer(prompt)
        return self.parse_response(response)
class StyleControlledRephraser(BaseRephraser):
    """
    Rephrase input text in a named style.

    The style selects an entry of ``STYLE_CONTROLLED_REPHRASING_PROMPTS``;
    the language detected for each text selects the template within it.
    """

    def __init__(self, llm_client: Any, style: str = "critical_analysis"):
        """
        Args:
            llm_client: client passed through to ``BaseRephraser``.
            style: key into ``STYLE_CONTROLLED_REPHRASING_PROMPTS``.

        Raises:
            ValueError: if ``style`` is not a known style.
        """
        super().__init__(llm_client)
        # Fail at construction time instead of with a bare KeyError on every
        # item later in build_prompt.
        if style not in STYLE_CONTROLLED_REPHRASING_PROMPTS:
            supported = ", ".join(sorted(STYLE_CONTROLLED_REPHRASING_PROMPTS))
            raise ValueError(
                f"Unsupported rephrase style: {style!r}. "
                f"Supported styles: {supported}"
            )
        self.style = style

    def build_prompt(self, text: str) -> str:
        """
        Fill the style/language template with ``text``.

        Raises:
            KeyError: if the style has no template for the detected language.
        """
        logger.debug("Text to be rephrased: %s", text)
        language = detect_main_language(text)
        templates = STYLE_CONTROLLED_REPHRASING_PROMPTS[self.style]
        try:
            prompt_template = templates[language]
        except KeyError:
            # Same exception type as the plain lookup, but say what is missing.
            raise KeyError(
                f"No {self.style!r} rephrasing template for language {language!r}"
            ) from None
        return prompt_template.format(text=text)

    @staticmethod
    def parse_response(response: str) -> Optional[dict]:
        """
        Turn the raw LLM response into a result record.

        Returns:
            ``None`` if the stripped response is empty; otherwise a dict with
            the stripped text under ``"content"`` and the value of
            ``compute_content_hash`` for that text under ``"_rephrased_id"``.
        """
        result = response.strip()
        logger.debug("Raw rephrased response: %s", result)
        if not result:
            return None
        return {
            "_rephrased_id": compute_content_hash(result),
            "content": result,
        }
class RephraseService(BaseOperator):
    """
    Operator that rephrases a batch of records with an LLM-backed rephraser.

    The rephraser is selected by ``method``; ``"style_controlled"`` is the
    only method handled here.
    """

    def __init__(
        self,
        working_dir: str = "cache",
        method: str = "style_controlled",
        **rephrase_kwargs,
    ):
        """
        Args:
            working_dir: forwarded to ``BaseOperator``.
            method: rephrasing method; must be ``"style_controlled"``.
            **rephrase_kwargs: method-specific options. For
                ``"style_controlled"`` the ``style`` key is read
                (default ``"critical_analysis"``).

        Raises:
            ValueError: if ``method`` is not supported.
        """
        super().__init__(working_dir=working_dir, op_name="rephrase_service")
        self.method = method
        self.rephrase_kwargs = rephrase_kwargs

        # Validate the method before creating the LLM client so that a
        # misconfigured node fails without initialising anything else.
        if self.method != "style_controlled":
            raise ValueError(f"Unsupported rephrase method: {self.method}")

        self.llm_client: BaseLLMWrapper = init_llm("synthesizer")

        # NOTE(review): kept as a function-scope import as in the original --
        # presumably to avoid an import cycle; confirm before hoisting.
        from graphgen.models import StyleControlledRephraser

        self.rephraser = StyleControlledRephraser(
            self.llm_client,
            style=rephrase_kwargs.get("style", "critical_analysis"),
        )

    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Rephrase every row of ``batch`` and return the results as a DataFrame."""
        items = batch.to_dict(orient="records")
        return pd.DataFrame(self.rephrase(items))

    def rephrase(self, items: list[dict]) -> list[dict]:
        """
        Run the rephraser over ``items`` via ``run_concurrent``.

        Returns:
            The results with falsy entries (e.g. ``None`` for an empty
            rephrasing) removed.
        """
        results = run_concurrent(
            self.rephraser.rephrase,
            items,
            desc="Rephrasing texts",
            unit="batch",
        )

        # Filter out empty results
        return [res for res in results if res]
index 72ab9446..da3add05 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -14,5 +14,6 @@ VQA_GENERATION_PROMPT, ) from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT +from .rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/rephrasing/__init__.py b/graphgen/templates/rephrasing/__init__.py new file mode 100644 index 00000000..da75223b --- /dev/null +++ b/graphgen/templates/rephrasing/__init__.py @@ -0,0 +1 @@ +from .style_controlled_rephrasing import STYLE_CONTROLLED_REPHRASING_PROMPTS diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py new file mode 100644 index 00000000..57101277 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/__init__.py @@ -0,0 +1,21 @@ +from .critical_analysis_rephrasing import CRITICAL_ANALYSIS_REPHRASING_PROMPTS +from .cross_domain_analogy_rephrasing import CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS +from .executive_summary_rephrasing import EXECUTIVE_SUMMARY_REPHRASING_PROMPTS +from .first_person_narrative_rephrasing import FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS +from .historical_evolution_perspective_rephrasing import ( + HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS, +) +from .popular_science_rephrasing import POPULAR_SCIENCE_REPHRASING_PROMPTS +from .qa_dialogue_format_rephrasing import QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS +from .technical_deep_dive_rephrasing import TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS + +STYLE_CONTROLLED_REPHRASING_PROMPTS = { + "popular_science": POPULAR_SCIENCE_REPHRASING_PROMPTS, + "critical_analysis": CRITICAL_ANALYSIS_REPHRASING_PROMPTS, + "cross_domain_analogy": CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS, + "technical_deep_dive": 
TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS, + "executive_summary": EXECUTIVE_SUMMARY_REPHRASING_PROMPTS, + "first_person_narrative": FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS, + "historical_evolution_perspective": HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS, + "qa_dialogue_format": QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py new file mode 100644 index 00000000..a6054178 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/critical_analysis_rephrasing.py @@ -0,0 +1,52 @@ +TEMPLATE_ZH = """ +【任务】以学术批判视角改写以下内容,形成技术评论文章。 + +【核心要求】 +1. 语气风格:客观理性,第三人称学术视角,使用规范学术用语 +2. 内容结构: + - 准确总结原文核心方法/发现(占比40%) + - 分析技术优势与创新点(占比20%) + - 指出潜在局限性与假设条件(占比20%) + - 提出可能的改进方向或未来工作(占比20%) +3. 引用规范:保留原文所有关键引用,采用标准学术引用格式 +4. 事实准确性:不得歪曲或误读原文技术细节 + +【输出格式】 +- 标题:原标题 + ":一项批判性分析" +- 段落:标准学术论文章节结构 +- 字数:与原文相当或略长 + +原文内容: +{text} + +请输出批判性分析改写版本: +""" + +TEMPLATE_EN = """ +【Task】Rewrite the following content from an academic critical perspective as a technical commentary. + +【Core Requirements】 +1. Tone: Objective and rational, third-person academic perspective, using standard academic terminology +2. Structure: + - Accurately summarize core methods/findings (40% of content) + - Analyze technical advantages and innovations (20%) + - Identify potential limitations and assumptions (20%) + - Propose possible improvements or future work (20%) +3. Citations: Retain all key references from original, using standard academic citation format +4. 
Factual Accuracy: Do not distort or misinterpret technical details + +【Output Format】 +- Title: Original Title + ": A Critical Analysis" +- Paragraphs: Standard academic paper structure +- Length: Similar to or slightly longer than original + +Original Content: +{text} + +Please output the critically analyzed rewrite: +""" + +CRITICAL_ANALYSIS_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py new file mode 100644 index 00000000..a3130a49 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/cross_domain_analogy_rephrasing.py @@ -0,0 +1,62 @@ +TEMPLATE_ZH = """ +【任务】通过跨领域类比解释技术概念。 + +【类比原则】 +- 类比源领域:生物学、物理学、建筑学、经济学、烹饪等领域 +- 类比强度:类比关系需直观且深刻,避免牵强附会 +- 目标:降低理解门槛,同时保持技术严谨性 + +【核心要求】 +1. 双轨并行:每个技术概念配一个恰当类比 +2. 类比结构: + - 先介绍技术概念(准确、完整) + - 再引入类比对象及其映射关系 + - 最后说明类比局限性和适用范围 +3. 保真红线:技术部分必须与原文完全一致,不得因类比而简化 +4. 创新性:鼓励使用新颖、出人意料但合理的类比 +5. 篇幅:可比原文扩展20-40% + +【评估标准】 +- 类比恰当性(技术概念与类比对象的核心机制必须同构) +- 技术准确性(不得扭曲事实) +- 启发性(帮助读者建立深层理解) + +原文内容: +{text} + +请输出跨领域类比版本: +""" + +TEMPLATE_EN = """ +【Task】Explain technical concepts through cross-domain analogies. + +【Analogy Principles】 +- Source Domains: Biology, physics, architecture, economics, cooking, etc. +- Strength: Analogy should be intuitive yet profound, avoid forced comparisons +- Goal: Lower understanding barrier while maintaining technical rigor + +【Core Requirements】 +1. Dual Track: Pair each technical concept with an appropriate analogy +2. Analogy Structure: + - First introduce technical concept (accurate and complete) + - Then introduce analogy object and mapping relationship + - Finally explain analogy limitations and applicable scope +3. Fidelity Baseline: Technical parts must be identical to original, no simplification for analogy's sake +4. 
Innovation: Encourage novel, surprising but reasonable analogies +5. Length: May expand 20-40% beyond original + +【Evaluation Criteria】 +- Analogy Appropriateness (core mechanisms must be isomorphic) +- Technical Accuracy (no factual distortion) +- Heuristic Value (helps build deep understanding) + +Original Content: +{text} + +Please output the cross-domain analogy version: +""" + +CROSS_DOMAIN_ANALOGY_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/executive_summary_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/executive_summary_rephrasing.py new file mode 100644 index 00000000..881b9343 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/executive_summary_rephrasing.py @@ -0,0 +1,64 @@ +TEMPLATE_ZH = """ +【任务】为高管层撰写决策摘要。 + +【读者假设】 +- 职位:CTO/技术VP/产品总监 +- 核心关切:技术价值、资源投入、竞争壁垒、商业影响 + +【核心要求】 +1. 信息密度:每句话必须传达战略价值 +2. 内容优先级: + - 核心技术突破与创新价值(必须) + - 与竞品的差异化优势(必须) + - 实施成本与资源需求(必须) + - 潜在商业应用场景(必须) + - 技术风险评估(可选) +3. 语言风格:金字塔原理,结论先行,数据支撑 +4. 简洁性:控制在原文长度的30-50% +5. 事实准确性:所有数据、性能指标必须与原文完全一致 + +【禁用表达】 +- 避免"可能"、"也许"等不确定表述 +- 禁用技术细节描述(除非直接影响决策) +- 避免行话和缩写 + +原文内容: +{text} + +请直接输出高管决策摘要: +""" + +TEMPLATE_EN = """ +【Task】Write an executive summary for C-suite decision-making. + +【Audience Assumption】 +- Position: CTO/VP of Engineering/Product Director +- Core Concerns: Technical value, resource investment, competitive moats, business impact + +【Core Requirements】 +1. Information Density: Every sentence must convey strategic value +2. Content Priority: + - Core technical breakthrough and innovation value (MUST) + - Differentiated advantages over competitors (MUST) + - Implementation cost and resource requirements (MUST) + - Potential business application scenarios (MUST) + - Technical risk assessment (OPTIONAL) +3. Language Style: Pyramid principle - lead with conclusions, support with data +4. 
Conciseness: 30-50% of original length +5. Factual Accuracy: All data and performance metrics must be identical to original + +【Prohibited Expressions】 +- Avoid uncertain terms like "maybe," "perhaps" +- No deep technical details (unless directly impacting decision) +- No jargon or unexplained acronyms + +Original Content: +{text} + +Please output the executive summary directly: +""" + +EXECUTIVE_SUMMARY_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/first_person_narrative_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/first_person_narrative_rephrasing.py new file mode 100644 index 00000000..24b79aae --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/first_person_narrative_rephrasing.py @@ -0,0 +1,60 @@ +TEMPLATE_ZH = """ +【任务】将技术文档改写为第一人称实践经验分享。 + +【角色设定】 +- 身份:资深技术实践者/研究员 +- 场景:技术博客/内部经验分享会 +- 目标读者:同行从业者 + +【核心要求】 +1. 视角:全程使用"我/我们"第一人称 +2. 内容融合: + - 保留原文所有技术事实(代码、数据、架构) + - 添加个人实践中的观察、挑战与解决思路 + - 分享真实应用场景和效果数据 +3. 语言风格:专业但亲和,避免过度口语化 +4. 叙事元素:可包含"最初尝试-遇到问题-调整思路-最终效果"的故事线 +5. 事实红线:技术细节必须与原文完全一致,不得虚构数据 + +【禁止】 +- 不得编造不存在的个人经历 +- 不得改变技术实现细节 + +原文内容: +{text} + +请直接输出第一人称叙事版本: +""" + +TEMPLATE_EN = """ +【Task】Rewrite the technical document as a first-person practical experience sharing. + +【Role Setting】 +- Identity: Senior practitioner/researcher +- Scenario: Technical blog/internal sharing session +- Target Audience: Peer professionals + +【Core Requirements】 +1. Perspective: Use first-person "I/we" throughout +2. Content Integration: + - Retain ALL technical facts (code, data, architecture) from original + - Add personal observations, challenges, and solution approaches from practice + - Share real application scenarios and performance data +3. Language Style: Professional yet approachable, avoid excessive colloquialism +4. 
Narrative: May include "initial attempt-encountered problem-adjusted approach-final result" storyline +5. Factual Baseline: Technical details must be identical to original, no fabricated data + +【Prohibited】 +- Do not invent non-existent personal experiences +- Do not alter technical implementation details + +Original Content: +{text} + +Please output the first-person narrative version directly: +""" + +FIRST_PERSON_NARRATIVE_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/historical_evolution_perspective_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/historical_evolution_perspective_rephrasing.py new file mode 100644 index 00000000..68cab3b9 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/historical_evolution_perspective_rephrasing.py @@ -0,0 +1,68 @@ +TEMPLATE_ZH = """ +【任务】按技术发展史视角重构内容。 + +【叙事框架】 +- 时间轴线:从起源→关键突破→当前状态→未来趋势 +- 演进逻辑:揭示"技术瓶颈突破→新范式建立→新问题出现"的循环 + +【核心要求】 +1. 时间准确性:所有时间点、版本号、发布顺序必须核实准确 +2. 因果链: + - 明确每个演进阶段的驱动力(理论突破/工程需求/硬件进步) + - 指出技术演进的必然性与偶然性 +3. 内容结构: + - 背景与起源(技术诞生前的状态) + - 关键里程碑(带具体时间) + - 范式转移(革命性变化) + - 当前成熟形态 + - 未来展望(基于原文技术路径) +4. 技术保真:所有技术描述必须与原文事实一致 +5. 分析深度:不能仅罗列事实,必须揭示演进逻辑 + +【输出规范】 +- 使用时间轴标记(如[2017]、[2020])增强可读性 +- 关键人物/团队需保留原名 +- 禁止编造不存在的技术演进路径 + +原文内容: +{text} + +请输出历史演进视角版本: +""" + +TEMPLATE_EN = """ +【Task】Reconstruct content from a technological history evolution perspective. + +【Narrative Framework】 +- Timeline: Origin → Key Breakthroughs → Current State → Future Trends +- Evolution Logic: Reveal the cycle of "technical bottleneck breakthrough → new paradigm establishment → new problems emerge" + +【Core Requirements】 +1. Temporal Accuracy: ALL dates, version numbers, and release sequences must be verified and accurate +2. 
Causality Chain: + - Identify drivers of each evolution stage (theoretical breakthrough/engineering needs/hardware advances) + - Point out inevitability and contingency of technical evolution +3. Content Structure: + - Background & Origin (state before technology birth) + - Key Milestones (with specific dates) + - Paradigm Shifts (revolutionary changes) + - Current Mature Form + - Future Outlook (based on original's technical trajectory) +4. Technical Fidelity: ALL technical descriptions must be factually consistent with original +5. Analytical Depth: Must reveal evolution logic, not just list facts + +【Output Specification】 +- Use timeline markers ([2017], [2020]) for readability +- Keep original names of key people/teams +- DO NOT invent non-existent evolution paths + +Original Content: +{text} + +Please output the historical evolution version: +""" + +HISTORICAL_EVOLUTION_PERSPECTIVE_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/popular_science_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/popular_science_rephrasing.py new file mode 100644 index 00000000..4c03c36f --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/popular_science_rephrasing.py @@ -0,0 +1,46 @@ +TEMPLATE_ZH = """ +【任务】将以下技术文档改写为面向普通读者的科普文章。 + +【核心要求】 +1. 语言风格:生动活泼,避免冷僻专业术语;必须使用术语时,需用生活化比喻或类比解释 +2. 内容保真:所有核心事实、数据和技术结论必须准确无误,不得篡改或过度简化 +3. 叙事结构:采用"问题-发现-应用"的故事线,增强可读性 +4. 读者定位:假设读者具有高中文化水平,无专业背景 +5. 篇幅控制:可适当扩展,但每段聚焦一个核心概念 + +【禁止行为】 +- 不得删除关键技术细节 +- 不得改变原意或事实 +- 避免使用"这个东西"、"那个技术"等模糊指代 + +原文内容: +{text} + +请直接输出改写后的科普文章: +""" + +TEMPLATE_EN = """ +【Task】Rewrite the following technical document as a popular science article for general readers. + +【Core Requirements】 +1. Language Style: Lively and engaging; avoid jargon; when technical terms are necessary, explain with everyday analogies or metaphors +2. 
Content Fidelity: All core facts, data, and technical conclusions must be accurate. Do not distort or oversimplify +3. Narrative Structure: Use a "problem-discovery-application" storyline to enhance readability +4. Audience: Assume high school education level, no technical background +5. Length: May expand moderately, but each paragraph should focus on one core concept + +【Prohibited】 +- Do not remove key technical details +- Do not change original meaning or facts +- Avoid vague references like "this thing" or "that technology" + +Original Content: +{text} + +Please output the rewritten popular science article directly: +""" + +POPULAR_SCIENCE_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/qa_dialogue_format_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/qa_dialogue_format_rephrasing.py new file mode 100644 index 00000000..8c63cce9 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/qa_dialogue_format_rephrasing.py @@ -0,0 +1,73 @@ +TEMPLATE_ZH = """ +【任务】将技术文档重构为自然问答对话。 + +【对话设计原则】 +- 对话角色:提问者(好奇心驱动的学习者) vs 解答者(专家) +- 问题序列:从基础概念→技术细节→应用实践→深度追问,逻辑递进 + +【核心要求】 +1. 问题设计: + - 每个问题必须源于原文知识点 + - 问题要具体、明确,避免空泛 + - 体现真实学习过程中的疑惑点 +2. 回答规范: + - 回答必须准确、完整,引用原文事实 + - 保持专家解答的权威性 + - 可适当补充背景信息帮助理解 +3. 对话流畅性:问题间有自然过渡,避免跳跃 +4. 覆盖度:确保原文所有重要知识点都被至少一个问题覆盖 +5. 事实核查:回答中的技术细节、数据必须与原文完全一致 + +【输出格式】 +Q1: [问题1] +A1: [回答1] + +Q2: [问题2] +A2: [回答2] +... + +原文内容: +{text} + +请输出问答对话版本: +""" + +TEMPLATE_EN = """ +【Task】Reconstruct the technical document as a natural Q&A dialogue. + +【Dialogue Design Principles】 +- Roles: Inquirer (curious learner) vs. Expert (domain specialist) +- Question Flow: From basic concepts → technical details → practical applications → deep follow-ups, logically progressive + +【Core Requirements】 +1. 
Question Design: + - Each question must originate from original content knowledge points + - Questions should be specific and clear, avoid vagueness + - Reflect points of confusion in the real learning process +2. Answer Specification: + - Answers must be accurate and complete, citing original facts + - Maintain authoritative expert tone + - May supplement background information when helpful +3. Dialogue Fluency: Natural transition between questions, avoid jumping +4. Coverage: Ensure ALL important knowledge points from original are covered by at least one question +5. Fact Check: Technical details and data in answers must be identical to original + +【Output Format】 +Q1: [Question 1] +A1: [Answer 1] + +Q2: [Question 2] +A2: [Answer 2] +... + +Original Content: +{text} + +Please output the Q&A dialogue version: +""" + + +QA_DIALOGUE_FORMAT_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +} diff --git a/graphgen/templates/rephrasing/style_controlled_rephrasing/technical_deep_dive_rephrasing.py b/graphgen/templates/rephrasing/style_controlled_rephrasing/technical_deep_dive_rephrasing.py new file mode 100644 index 00000000..3e0ab691 --- /dev/null +++ b/graphgen/templates/rephrasing/style_controlled_rephrasing/technical_deep_dive_rephrasing.py @@ -0,0 +1,66 @@ +TEMPLATE_ZH = """ +【任务】以领域专家视角进行深度技术剖析。 + +【读者定位】 +- 目标读者:同领域高级工程师/研究员 +- 预期效果:揭示技术细节、设计权衡与实现原理 + +【核心要求】 +1. 技术精确性: + - 使用精确的专业术语和符号表示 + - 补充技术背景、相关工作和理论基础 + - 必要时用公式或代码片段说明 +2. 深度维度: + - 算法复杂度分析 + - 系统架构设计权衡 + - 性能瓶颈与优化空间 + - 边界条件和异常情况处理 +3. 内容扩展:可在原文基础上增加30-50%的技术细节 +4. 语气:权威、严谨、逻辑严密 + +【输出规范】 +- 保持原文所有事实准确无误 +- 新增细节需符合领域常识 +- 使用标准技术文档格式 + +原文内容: +{text} + +请输出技术深度剖析版本: +""" + +TEMPLATE_EN = """ +【Task】Conduct an in-depth technical analysis from a domain expert perspective. + +【Audience】 +- Target: Senior engineers/researchers in the same field +- Goal: Reveal technical details, design trade-offs, and implementation principles + +【Core Requirements】 +1. 
Technical Precision: + - Use precise technical terminology and notation + - Supplement with technical background, related work, and theoretical foundations + - Include formulas or code snippets when necessary +2. Depth Dimensions: + - Algorithmic complexity analysis + - System architecture design trade-offs + - Performance bottlenecks and optimization opportunities + - Edge cases and exception handling +3. Content Expansion: May add 30-50% more technical details than original +4. Tone: Authoritative, rigorous, logically sound + +【Output Specification】 +- Maintain 100% factual accuracy from original +- Added details must align with domain common knowledge +- Use standard technical documentation format + +Original Content: +{text} + +Please output the technical deep-dive version: +""" + +TECHNICAL_DEEP_DIVE_REPHRASING_PROMPTS = { + "zh": TEMPLATE_ZH, + "en": TEMPLATE_EN, +}