jack dehao li (李德豪) 2025-05-12 14:18:19 +08:00
commit 4af0f9c4a1
8 changed files with 795 additions and 0 deletions

.idea/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/misc.xml Normal file

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (llama_factory_local)" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="Python 3.12 (llama_factory_local)" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

.idea/modules.xml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/offline_data_model_pipline.iml" filepath="$PROJECT_DIR$/.idea/offline_data_model_pipline.iml" />
</modules>
</component>
</project>

.idea/offline_data_model_pipline.iml Normal file

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="FacetManager">
<facet type="Python" name="Python">
<configuration sdkName="Python 3.12 (llama_factory_local)" />
</facet>
</component>
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.12 (llama_factory_local) interpreter library" level="application" />
</component>
</module>

.idea/vcs.xml Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>


@@ -0,0 +1,154 @@
"""
政务12345全国数据生成系统
功能
1. 支持全国范围地理位置生成
2. 多层级分类扩展
3. 数据保存至Excel
4. 真实业务场景模拟
"""
import pandas as pd
from openai import OpenAI
import random
import time
import re
from typing import List, Dict, Tuple
client = OpenAI(api_key="your-api-key")
class NationalDataGenerator:
def __init__(self, excel_path: str, category_column: str):
self.base_categories = self._load_excel_categories(excel_path, category_column)
self.location_pool = self._generate_national_locations()
self.expanded_categories = self._expand_categories_with_gpt()
self.used_records = set()
def _load_excel_categories(self, path: str, column: str) -> List[str]:
"""从Excel读取基础分类"""
df = pd.read_excel(path)
return df[column].dropna().unique().tolist()
def _generate_national_locations(self, num=200) -> List[str]:
"""生成全国真实地理位置库"""
response = client.chat.completions.create(
model="gpt-4-turbo",
messages=[{
"role": "user",
"content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城"
}]
)
return [line.split(" ")[1] for line in response.choices[0].message.content.strip().split("\n")]
def _expand_categories_with_gpt(self) -> Dict[str, List[str]]:
"""GPT扩展分类体系"""
category_map = {}
for base_cat in self.base_categories:
response = client.chat.completions.create(
model="gpt-4-turbo",
messages=[{
"role": "user",
"content": f"生成与【{base_cat}】相关但具有政务场景区分度的5个细分类型示例\n- 类型1施工许可违规\n- 类型2夜间施工超时"
}]
)
# Strip the leading "- 类型N:" prefix so only the sub-category text remains
sub_cats = [re.sub(r"^[-\s]*类型\d*[::]?", "", line).strip()
for line in response.choices[0].message.content.strip().split("\n") if line.strip()]
category_map[base_cat] = sub_cats
time.sleep(1)
return category_map
def generate_dataset(self, num_records: int) -> pd.DataFrame:
"""生成核心数据集"""
data = []
while len(data) < num_records:
base_cat = random.choice(self.base_categories)
sub_cat = random.choice(self.expanded_categories[base_cat])
location = random.choice(self.location_pool)
content, keywords = self._generate_content(base_cat, sub_cat, location)
if content and self._validate_record(content, keywords, base_cat):
data.append({
"ID": len(data)+1,
"内容": content,
"关键词": " ".join(keywords),
"参考答案": base_cat,
"细分类型": sub_cat,
"地理位置": location
})
time.sleep(1.2)
return pd.DataFrame(data)
def _generate_content(self, base_cat: str, sub_cat: str, location: str) -> Tuple[str, List[str]]:
"""生成政务工单内容"""
prompt = f"""生成真实可信的12345政务工单要求
1. 主分类{base_cat}
2. 细分类型{sub_cat}
3. 发生地点{location}
4. 包含要素时间具体问题影响范围市民诉求
5. 生成5个关键词必须包含{base_cat}
6. 内容长度80-150
示例格式
市民反映{location}某建筑工地违规夜间施工至凌晨噪音严重干扰周边居民已向环保部门投诉3次未解决要求立即停工整顿
关键词夜间施工 噪音污染 环保投诉 施工许可 居民维权"""
try:
response = client.chat.completions.create(
model="gpt-4-turbo",
messages=[
{"role": "system", "content": "你是政务数据生成专家"},
{"role": "user", "content": prompt}
],
temperature=0.7,
max_tokens=600
)
raw_text = response.choices[0].message.content.strip()
return self._parse_generated_text(raw_text)
except Exception as e:
print(f"生成失败:{str(e)}")
return None, []
def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
"""解析生成文本"""
content = re.sub(r"关键词:.*", "", text).strip()
keywords = re.findall(r"关键词:(.+)", text)[0].split()[:5]
return content, keywords
def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
"""五重数据校验"""
return (
len(content) >= 80 and
len(keywords) == 5 and
category in keywords and
content not in self.used_records and
any(c.isdigit() for c in content) # must contain a numeric detail
)
# Example input file (input.xlsx):
"""
| 基础分类 |
|--------------|
| 施工管理 |
| 消费维权 |
| 城市管理 |
| 公共服务 |
"""
if __name__ == "__main__":
# Initialize the generator
generator = NationalDataGenerator(
excel_path="input.xlsx",
category_column="基础分类"
)
# Generate 100 records
df = generator.generate_dataset(100)
# Save to Excel
with pd.ExcelWriter("government_12345_data.xlsx") as writer:
df.to_excel(writer, index=False)
print("生成数据示例:")
print(df[["ID", "内容", "关键词", "参考答案"]].head(3).to_string(index=False))


@@ -0,0 +1,394 @@
import os
import json
import random
from typing import List, Dict, Tuple
from openai import OpenAI
from faker import Faker
class FullyDynamicGenerator:
def __init__(self):
self.llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.faker = Faker('zh_CN')
self.dynamic_memory = {}
self.special_cases = [
"方言沟通", "老年人口齿不清", "情绪激动打断对话",
"背景噪音干扰", "信号断续"
]
def generate_dialog(self, category: str, subcategory: str) -> List[Dict]:
"""全动态对话生成入口"""
scene_knowledge = self.generate_scene_knowledge(category, subcategory)
self.dynamic_memory[f"{category}_{subcategory}"] = scene_knowledge
dialog = []
dialog.extend(self._generate_complex_opening(category, subcategory))
dialog.extend(self._generate_obstacle_base_phase(scene_knowledge, subcategory))
dialog.extend(self._generate_verification_with_challenges(dialog))
dialog.extend(self._generate_technical_extend_phase(scene_knowledge, subcategory))
dialog.extend(self._generate_final_confirmation(scene_knowledge, subcategory))
return self._format_output(dialog)
def _generate_complex_opening(self, category: str, subcategory: str) -> List[Tuple]:
"""生成带复杂情形的开场对话"""
phase = []
special_case = random.choice(self.special_cases + [None]*3)
citizen_traits = {
# Keys must match the entries in self.special_cases so the .get() lookups below resolve
"方言沟通": random.choice(["带浓重口音", "夹杂方言词汇", "语法不规范"]),
"老年人口齿不清": random.choice(["说话缓慢", "重复语句", "耳背听不清"]),
"情绪激动打断对话": random.choice(["不断打断", "提高音量", "带哭腔"])
}
opening_prompt = f"""生成市民反映{subcategory}问题的电话开场白,要求:
1. 必须包含"您好"等礼貌用语
2. 体现真实通话特征{citizen_traits.get(special_case, "正常沟通")}
3. 包含具体问题细节"""
opening = self._safe_llm_call(
prompt=opening_prompt,
system="你擅长模拟各类人群的真实对话",
response_format={"type": "json_object"}
)
try:
opening_data = json.loads(opening)
opening_text = opening_data.get("text", f"您好,我要反映{subcategory}问题")
if special_case == "方言沟通":
opening_text = self._add_dialect_features(opening_text)
except Exception:
opening_text = f"您好,我想投诉{subcategory}问题"
phase.append(("市民", "open_call", opening_text))
response_prompt = f"""根据市民来电特征:{special_case if special_case else '正常'},生成专业应答:
1. 包含工号和服务承诺
2. 适应沟通特征{citizen_traits.get(special_case, '标准服务')}"""
response = self._safe_llm_call(
prompt=response_prompt,
system="你是适应力强的专业客服",
response_format={"type": "json_object"}
)
try:
response_data = json.loads(response)
response_text = response_data.get("text", f"感谢来电,工号{random.randint(1000,1999)}为您服务")
if special_case == "老年人口齿不清":
response_text += "(放慢语速)请您慢慢说"
except Exception:
response_text = "您好,政务热线为您服务"
phase.append(("客服", "agent_response", response_text))
if special_case in ["方言沟通", "老年人口齿不清", "信号断续"]:
phase.append(("客服", "double_check", f"抱歉,刚才没有听清楚,您是说{subcategory}问题对吗?"))
phase.append(("市民", "clarify", random.choice([
"对,就是这个问题",
f"不是,是{random.choice(['更严重','其他'])}的问题",
"(声音断断续续)喂...听得到吗?"
])))
return phase
def _generate_obstacle_base_phase(self, knowledge: Dict, scene: str) -> List[Tuple]:
"""生成带沟通障碍的基础信息采集"""
phase = []
required_fields = ["时间", "地点", "事件描述", "联系方式", "姓氏"]
for field in required_fields:
if random.random() < 0.1:
unclear_question = self._safe_llm_call(
prompt=f"生成有歧义的{field}询问话术",
system="故意制造1-2处不明确表述"
) or f"那个...关于{field}的情况能不能说下?"
phase.append(("客服", "unclear_question", unclear_question))
phase.append(("市民", "confused", "您问的是什么?我没听明白"))
question = self._safe_llm_call(
prompt=f"重新生成清晰的{field}询问话术",
system="使用最简明的表达"
) or f"请提供{field}的具体信息"
phase.append(("客服", "retry_question", question))
else:
question = self._safe_llm_call(
prompt=f"生成政务热线询问{field}的标准话术,场景:{scene}",
system="要求1.使用敬语 2.明确信息要求"
) or f"请问{scene}{field}是?"
phase.append(("客服", "info_request", question))
answer, needs_clarify = self._generate_complex_answer(scene, field)
phase.append(("市民", "info_response", answer))
if needs_clarify:
clarify_question = self._safe_llm_call(
prompt=f"根据模糊回答'{answer}'生成澄清{field}的追问",
system="要求1.指出不明确处 2.提供填写范例"
) or f"您提供的{field}不够具体,请补充(例:{self._get_field_example(field)}"
phase.append(("客服", "clarify_request", clarify_question))
if random.random() < 0.1:
phase.append(("市民", "refuse", random.choice([
"这么麻烦不说了!",
"你们政府办事就是繁琐",
f"{field}有什么好问的!"
])))
phase.append(("客服", "calm_down", random.choice([
"理解您的心情,但详细信息能帮助我们更快解决问题",
"抱歉给您带来不便,这是必要流程"
])))
phase.append(("市民", "clarified_response", f"哦,应该是{self._get_field_example(field)}"))
return phase
def _generate_complex_answer(self, scene: str, field: str) -> Tuple[str, bool]:
"""生成带复杂特征的市民回答"""
if random.random() < 0.15:
special_answers = {
"时间": [
("就...就那个...前几天", True),
("(背景嘈杂)喂?时间啊...上周?", True),
("我不记得了!你们自己查!", False)
],
"地点": [
("俺们村东头那个...那个啥来着", True),
("(信号不好)在...哗哗...超市附近", True),
("这么简单的问题都处理不了?", False)
]
}
return random.choice(special_answers.get(field, [("这个我说不好", True)]))
answers = {
"时间": [
(f"{random.choice(['今天','昨天'])}{random.randint(1,12)}点左右", False),
(f"持续{random.randint(2,24)}小时了", False)
],
"地点": [
(f"{self.faker.building_number()}{random.choice(['东侧','南门'])}", False),
(f"{self.faker.street_name()}附近", True)
],
"联系方式": [
(f"{self.faker.phone_number()[:3]}****", True),
(f"固话:{self.faker.phone_number()[:4]}-{self.faker.phone_number()[-4:]}", False)
],
"姓氏": [
(f"免贵姓{self.faker.last_name()}", False),
("叫我老李就行", True)
]
}
return random.choice(answers.get(field, [("具体情况是这样的...", False)]))
def _generate_verification_with_challenges(self, previous_dialog: List[Tuple]) -> List[Tuple]:
"""生成带挑战的信息确认环节"""
phase = []
collected_info = {}
for turn in previous_dialog:
if turn[1] in ["info_response", "clarified_response"]:
for field in ["时间", "地点", "姓氏"]:
if field in turn[2]:
collected_info[field] = turn[2]
if random.random() < 0.1:
collected_info[field] = self._get_wrong_info(field)
if collected_info:
if random.random() < 0.05:
wrong_field = random.choice(list(collected_info.keys()))
# Deliberately corrupt one field so the correction flow gets exercised
collected_info[wrong_field] = self._get_wrong_info(wrong_field)
verification_text = self._safe_llm_call(
prompt="根据以下信息生成确认话术:" + json.dumps(collected_info, ensure_ascii=False),
system="要求1.逐项确认 2.允许修正"
) or f"我确认下:时间:{collected_info.get('时间','')},地点:{collected_info.get('地点','')}..."
phase.append(("客服", "info_verification", verification_text))
if random.random() < 0.3:
correction_field = random.choice(list(collected_info.keys()))
phase.append(("市民", "correction",
f"{correction_field}不对!应该是{self._get_field_example(correction_field)}"))
if random.random() < 0.1:
phase.append(("市民", "angry", "你们连基本信息都记错!"))
phase.append(("客服", "apology", "非常抱歉,这是我们的失误"))
phase.append(("客服", "acknowledge_correction", f"已更正{correction_field}信息"))
phase.append(("市民", "final_confirmation", "现在对了"))
else:
phase.append(("市民", "confirmation", "对,没错"))
return phase
def _generate_technical_extend_phase(self, knowledge: Dict, scene: str) -> List[Tuple]:
"""生成带技术障碍的扩展追问"""
phase = []
for question_config in knowledge.get("extend_questions", []):
if random.random() < 0.05:
tech_question = self._safe_llm_call(
prompt=f"生成包含专业术语的{scene}问题",
system="使用3个以上专业词汇"
) or f"请问{scene}{random.choice(['频谱特征','声压级衰减曲线'])}是怎样的?"
phase.append(("客服", "technical_question", tech_question))
phase.append(("市民", "not_understand", "这些专业名词听不懂"))
simplified = self._safe_llm_call(
prompt=f"'{tech_question}'转化为通俗问题",
system="用生活化比喻解释"
) or f"就是问{scene}的具体表现是怎样的"
phase.append(("客服", "simplified_question", simplified))
else:
question = self._safe_llm_call(
prompt=f"基于{scene}场景生成追问:{question_config.get('prompt','')}",
system="要求1.分步骤询问 2.适度专业"
) or question_config.get('prompt','')
phase.append(("客服", "extend_question", question))
if random.random() < 0.15:
phase.append(("市民", "broken_response", "喂?...听得到吗?...我说到哪了?"))
phase.append(("客服", "reassure", "电话不太稳定,请您继续"))
answer = self._generate_realistic_answer(
question, scene, question_config.get("theme",""), "extend"
)
phase.append(("市民", "extend_answer", answer))
if random.random() < 0.1:
phase.append(("客服", "request_material", "需要您提供现场照片或录音证据"))
phase.append(("市民", "material_response", random.choice([
"我手机里有,怎么发给你们?",
"现在拍不了,你们自己来看!"
])))
phase.append(("客服", "guide", "可以通过微信公众号'市民服务'上传"))
return phase
def _generate_final_confirmation(self, knowledge: Dict, scene: str) -> List[Tuple]:
"""生成最终确认"""
phase = []
confirmation = self._safe_llm_call(
prompt=f"生成{scene}问题的最终确认话术",
system="包含1.处理时限 2.反馈方式 3.应急联系人"
) or f"我们将在{random.choice(['24小时','3个工作日'])}内处理您的{scene}问题"
phase.append(("客服", "final_confirmation", confirmation))
if random.random() < 0.2:
phase.append(("市民", "follow_up", random.choice([
"如果超时没处理怎么办?",
"我要找哪个部门跟进?"
])))
phase.append(("客服", "replay", random.choice([
"可拨打监督电话12345查询进度",
"我们会主动给您回复"
])))
return phase
def _generate_scene_knowledge(self, category: str, subcategory: str) -> Dict:
"""动态生成场景知识图谱"""
prompt = f"""作为政务热线专家,请为【{category}->{subcategory}】场景生成知识配置,包含:
1. 3-5个必问基础字段(如时间、地点)
2. 3个专业追问方向及追问话术模板
3. 该场景涉及的相关部门和处理时限参考
返回JSON格式,结构示例:
{{
"base_fields": [
{{"field": "时间", "prompt": "询问具体时间的标准话术"}},
{{"field": "地点", "prompt": "询问详细位置的专业话术"}}
],
"extend_questions": [
{{"theme": "历史记录", "prompt": "追问历史投诉情况的专业话术"}},
{{"theme": "紧急程度", "prompt": "评估问题紧急程度的询问方式"}}
],
"departments": ["城管局", "环保局"],
"time_ranges": ["24小时内", "3个工作日"]
}}"""
response = self._safe_llm_call(
prompt=prompt,
system="你是有10年经验的政务热线系统架构师",
response_format={"type": "json_object"}
)
try:
knowledge = json.loads(response)
knowledge["confirmation_template"] = self._generate_confirmation_template(
category, subcategory, knowledge.get("departments", []), knowledge.get("time_ranges", [])
)
return knowledge
except Exception:
return self._get_fallback_knowledge(category, subcategory)
def _generate_confirmation_template(self, category: str, subcategory: str,
departments: List[str], time_ranges: List[str]) -> str:
"""生成确认话术模板"""
prompt = f"""为【{category}->{subcategory}】创建确认话术模板,要求包含:
1. 处理部门:{departments}
2. 预计时限:{time_ranges}
3. 至少2种后续跟进方式
模板示例:\"我们将协调{{department}}在{{timeframe}}内处理,可通过{{phone}}或{{wechat}}查询进展\"
"""
return self._safe_llm_call(
prompt=prompt,
system="你需创建可参数化的文本模板,用{}标记变量位置"
) or f"我们将尽快处理您的{subcategory}问题"
def _generate_realistic_answer(self, question: str, scene: str,
field: str, answer_type: str) -> str:
"""生成高真实性回答"""
prompt = f"""模拟市民对【{scene}】问题中'{question}'的真实回答,要求:
1. 包含具体{field}的细节数据
2. 反映真实诉求和情绪梯度
3. 使用该场景典型市民的语言特征"""
system = {
"base": "你是一个普通市民,回答要口语化并带生活细节",
"extend": "你是有相关专业知识的市民,回答要包含技术参数和量化描述"
}[answer_type]
answer = self._safe_llm_call(prompt=prompt, system=system)
return answer or self._get_field_example(field)
def _get_field_example(self, field: str) -> str:
"""获取字段示例"""
examples = {
"时间": "2023年10月15日下午3点20分",
"地点": "朝阳区建国路88号地下二层停车场",
"联系方式": "13800138000或010-12345678",
"姓氏": "张先生/李女士"
}
return examples.get(field, "具体情况是这样的...")
def _get_fallback_knowledge(self, category: str, subcategory: str) -> Dict:
"""应急知识库"""
return {
"base_fields": [
{"field": "时间", "prompt": f"请问{subcategory}发生的具体时间?"},
{"field": "地点", "prompt": f"请说明{category}问题的详细位置?"}
],
"extend_questions": [
{"theme": "基本情况", "prompt": f"请描述{subcategory}的具体表现?"}
],
"confirmation_template": f"我们将处理您的{category}问题",
"departments": ["相关部门"],
"time_ranges": ["尽快"]
}
def _add_dialect_features(self, text: str) -> str:
"""添加方言特征"""
dialects = {
"北方方言": [("", ""), ("", ""), ("这个", "这玩意儿")],
"南方方言": [("是不是", "系唔系"), ("不知道", "母鸡"), ("", "")]
}
dialect_type, replacements = random.choice(list(dialects.items()))
for orig, rep in replacements:
if orig in text:
return text.replace(orig, rep)
return text + random.choice(["晓得伐?", "中不中?", "得啵?"])
def _get_wrong_info(self, field) -> str:
"""生成错误信息"""
wrong_examples = {
"时间": random.choice(["昨天", "上周", "记不清了"]),
"地点": random.choice(["东边", "路口", "大概位置"]),
"姓氏": random.choice(["", "", ""])
}
return wrong_examples.get(field, "信息有误")
def _safe_llm_call(self, prompt: str, system: str = None, **kwargs) -> str:
"""LLM call wrapped with a simple failure fallback."""
try:
messages = [{"role": "user", "content": prompt}]
if system:
messages.insert(0, {"role": "system", "content": system})
response = self.llm.chat.completions.create(
model="gpt-4-turbo",
messages=messages,
temperature=0.7,
max_tokens=400,
**kwargs
)
return response.choices[0].message.content
except Exception as e:
print(f"API异常: {str(e)}")
return ""
def _format_output(self, dialog: List[Tuple]) -> List[Dict]:
"""格式化输出"""
return [{
"turn": idx+1,
"speaker": speaker,
"type": dtype,
"content": content
} for idx, (speaker, dtype, content) in enumerate(dialog)]
if __name__ == "__main__":
os.environ["OPENAI_API_KEY"] = "your-api-key"
generator = FullyDynamicGenerator()
dialog = generator.generate_dialog("城乡建设", "施工噪音")
print("\n=== 政务热线完整对话 ===")
for turn in dialog:
print(f"{turn['turn']}. [{turn['speaker']}][{turn['type']}] {turn['content']}")


@@ -0,0 +1,206 @@
from openai import OpenAI, APIError, RateLimitError, AuthenticationError
import csv
import json
import re
import ast
from typing import List, Dict, Tuple
class GovDataGenerator:
def __init__(self, api_key: str):
# Use the caller-supplied key; never hard-code secrets in source
base_url = "https://api.deepseek.com"
self.client = OpenAI(api_key=api_key, base_url=base_url)
self.config_cache = {}
self._init_prompts()
def _init_prompts(self):
"""Initialize all prompt templates"""
self.base_prompt = """请生成与【{title}】相关的5个真实政务12345业务案例每个案例包含
1. 当事人如张先生+ 问题场景如XX街道XX小区
2. 业务类型如社保缴纳+ 具体问题如未回复
3. 涉及单位如医保局+ 时间要素如8月17日
4. 证件编号如370181XXXXXXXXXX+ 政策条件如连续缴费12个月
每个案例生成6-8个关键词格式
[案例1] 关键词关键词1关键词2...
[案例2] 关键词关键词A关键词B...
请用中文逗号分隔不要编号"""
self.config_prompt = """请根据政务领域【{title}】生成:
1. 3-5个核心业务分类(categories)
2. 1条特别生成要求(requirements)
示例(保险领域):
categories: ["医疗保险", "失业保险", "养老保险", "生育保险"]
requirements: "需包含医保报销和生育津贴案例各1个"
请用JSON格式返回:{{"categories": [], "requirements": ""}}"""
def _call_gpt(self, prompt: str, **kwargs) -> str:
"""统一GPT调用入口适配v0.28+"""
try:
response = self.client.chat.completions.create(
model="deepseek-chat",
temperature=kwargs.get('temperature', 0.7),
max_tokens=kwargs.get('max_tokens', 800),
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt},
],
stream=False
)
return response.choices[0].message.content
except RateLimitError:
raise Exception(f"请求超频请等待后重试错误码429")
except AuthenticationError:
raise Exception("API密钥无效请检查密钥是否正确")
except APIError as e:
raise Exception(f"API错误: {e.code} - {e.message}")
except Exception as e:
raise Exception(f"请求失败: {str(e)}")
def generate_dynamic_config(self, title: str) -> Tuple[List[str], str]:
"""动态生成领域配置"""
if title in self.config_cache:
return self.config_cache[title]
try:
prompt = self.config_prompt.format(title=title)
raw_text = self._call_gpt(prompt, temperature=0.5, max_tokens=300)
parsed = self.safe_parse_config(raw_text)
# Validate the returned structure
if not isinstance(parsed.get("categories", []), list) or \
not isinstance(parsed.get("requirements", ""), str):
raise ValueError("配置格式错误")
self.config_cache[title] = (parsed["categories"], parsed["requirements"])
return self.config_cache[title]
except Exception as e:
print(f"配置生成失败: {str(e)},使用默认配置")
return [], ""
def safe_parse_config(self, text: str) -> Dict:
"""Safely parse the configuration response."""
try:
# Try strict JSON parsing first
json_str = re.search(r'\{.*\}', text, re.DOTALL).group()
return json.loads(json_str)
except json.JSONDecodeError:
# Fall back to a tolerant literal parse
try:
return ast.literal_eval(json_str.replace('"', "'"))
except (ValueError, SyntaxError):
# Last-resort heuristic extraction
return {
"categories": list(set(re.findall(r'"([^"]+)"', text)))[:3],
"requirements": re.split(r'[::]', text.split("requirements")[-1])[-1].strip('"\'')
}
except Exception:
return {"categories": [], "requirements": ""}
def get_prompt(self, title: str) -> str:
"""构建动态提示"""
categories, requirements = self.generate_dynamic_config(title)
prompt = self.base_prompt.format(title=title)
if requirements:
prompt += f"\n特别要求:{requirements}"
if categories:
prompt += f"\n参考分类:{', '.join(categories[:3])}..."
return prompt
def generate_keywords(self, title: str) -> List[str]:
"""生成关键词主流程"""
try:
# 获取配置和生成内容
categories, _ = self.generate_dynamic_config(title)
prompt = self.get_prompt(title)
raw_text = self._call_gpt(prompt)
# 处理响应
return self.process_response(raw_text, categories)
except Exception as e:
print(f"关键词生成失败: {str(e)}")
return []
def process_response(self, raw_text: str, categories: List[str]) -> List[str]:
"""处理生成的响应内容"""
# 解析案例
cases = re.findall(r'关键词:(.*?)(?=\n\[案例|\n\n|$)', raw_text, re.DOTALL)
keywords = []
for case in cases:
keywords.extend([k.strip() for k in case.split('') if k.strip()])
# 分级抽样
sampled = []
if categories:
for cat in categories[:3]: # 取前3个分类
matches = [k for k in keywords if cat in k][:5] # 每个分类最多取5个
sampled.extend(matches)
# 合并去重
seen = set()
final = []
for k in sampled + keywords:
if k not in seen:
seen.add(k)
final.append(k)
return self.post_process(final[:60])[:50] # 最终保留50个
def post_process(self, keywords: List[str]) -> List[str]:
"""后处理管道"""
processed = []
for k in keywords:
# 标准化处理
k = re.sub(r'\s+', ' ', k).strip()
# 过滤无效条目
if len(k) < 4 or '...' in k:
continue
# 增强关键条目
if any(c.isdigit() for c in k) or re.search(r'[市区街道]', k):
processed.append(k)
# 优先级排序
priority_terms = ['社保', '医保', '房产证', '施工许可']
return sorted(processed,
key=lambda x: any(t in x for t in priority_terms),
reverse=True)
def save_to_tsv(self, data: List[dict], filename: str):
"""保存结果到TSV文件"""
with open(filename, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerow(['title', 'contents'])
for item in data:
writer.writerow([
item['title'],
json.dumps(item['keywords'], ensure_ascii=False)
])
if __name__ == "__main__":
# Usage example
generator = GovDataGenerator("sk-your-api-key-here")
# Generate data
results = []
for domain in ["保险", "城市管理", "房产"]:
print(f"正在处理:{domain}")
keywords = generator.generate_keywords(domain)
results.append({
"title": domain,
"keywords": keywords
})
print(f"生成完成,获得{len(keywords)}个关键词")
# Save the results
generator.save_to_tsv(results, "government_data.tsv")
print("数据已保存至 government_data.tsv")