# offline_data_model_pipline/data_generate/zw12345/baogao_content_extract.py
# Last modified: 2025-05-12 14:18:19 +08:00
"""
政务12345全国数据生成系统
功能
1. 支持全国范围地理位置生成
2. 多层级分类扩展
3. 数据保存至Excel
4. 真实业务场景模拟
"""
import os
import random
import re
import time
from typing import Dict, List, Tuple

import pandas as pd
from openai import OpenAI

# FIX: never hard-code credentials in source. Read the key from the
# environment, falling back to the original placeholder so existing
# behavior is preserved when the variable is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key"))
class NationalDataGenerator:
    """Generate synthetic nationwide 12345 government-hotline work orders.

    Pipeline: load base categories from an Excel sheet, pre-build a pool of
    real province/city/district locations and GPT-expanded sub-categories,
    then synthesize, parse and validate individual records.
    """

    def __init__(self, excel_path: str, category_column: str):
        """Load base categories and pre-build the location/category pools.

        Args:
            excel_path: Path to the Excel file holding base categories.
            category_column: Name of the column containing category labels.
        """
        self.base_categories = self._load_excel_categories(excel_path, category_column)
        self.location_pool = self._generate_national_locations()
        self.expanded_categories = self._expand_categories_with_gpt()
        # Contents already emitted; consulted by _validate_record for dedup.
        self.used_records = set()

    def _load_excel_categories(self, path: str, column: str) -> List[str]:
        """Return the unique, non-null base categories from an Excel column."""
        df = pd.read_excel(path)
        return df[column].dropna().unique().tolist()

    def _generate_national_locations(self, num=200) -> List[str]:
        """Ask GPT for ``num`` real three-level (province/city/district) locations."""
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{
                "role": "user",
                "content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城"
            }]
        )
        # FIX: the old `line.split(" ")[1]` raised IndexError on any line
        # lacking a "- " prefix; strip list markers defensively instead.
        locations = []
        for line in response.choices[0].message.content.strip().split("\n"):
            cleaned = line.strip().lstrip("-").strip()
            if cleaned:
                locations.append(cleaned)
        return locations

    def _expand_categories_with_gpt(self) -> Dict[str, List[str]]:
        """Expand every base category into GPT-generated sub-types."""
        category_map = {}
        for base_cat in self.base_categories:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[{
                    "role": "user",
                    "content": f"生成与【{base_cat}】相关but具有政务场景区分度的5个细分类型示例\n- 类型1施工许可违规\n- 类型2夜间施工超时"
                }]
            )
            # FIX: `re.sub(r".*", "", ...)` blanked every sub-category and
            # `split(" ")[1]` crashed on unprefixed lines. Strip the list
            # marker and any "类型N" numbering prefix instead.
            sub_cats = []
            for line in response.choices[0].message.content.strip().split("\n"):
                cleaned = re.sub(r"^[-\s]*(类型\d+[::]?)?\s*", "", line).strip()
                if cleaned:
                    sub_cats.append(cleaned)
            # Fall back to the base category so random.choice never sees
            # an empty list if parsing yielded nothing.
            category_map[base_cat] = sub_cats or [base_cat]
            time.sleep(1)  # crude rate limiting between API calls
        return category_map

    def generate_dataset(self, num_records: int) -> pd.DataFrame:
        """Generate ``num_records`` validated work-order records as a DataFrame."""
        data = []
        while len(data) < num_records:
            base_cat = random.choice(self.base_categories)
            sub_cat = random.choice(self.expanded_categories[base_cat])
            location = random.choice(self.location_pool)
            content, keywords = self._generate_content(base_cat, sub_cat, location)
            if content and self._validate_record(content, keywords, base_cat):
                # FIX: record accepted content so the duplicate check in
                # _validate_record actually rejects repeats (it was never
                # populated before).
                self.used_records.add(content)
                data.append({
                    "ID": len(data) + 1,
                    "内容": content,
                    "关键词": " ".join(keywords),
                    "参考答案": base_cat,
                    "细分类型": sub_cat,
                    "地理位置": location
                })
            time.sleep(1.2)  # crude rate limiting between API calls
        return pd.DataFrame(data)

    def _generate_content(self, base_cat: str, sub_cat: str, location: str) -> Tuple[str, List[str]]:
        """Generate one work-order text plus keywords; (None, []) on API failure."""
        prompt = f"""生成真实可信的12345政务工单要求
1. 主分类{base_cat}
2. 细分类型{sub_cat}
3. 发生地点{location}
4. 包含要素时间具体问题影响范围市民诉求
5. 生成5个关键词必须包含{base_cat}
6. 内容长度80-150
示例格式
市民反映{location}某建筑工地违规夜间施工至凌晨噪音严重干扰周边居民已向环保部门投诉3次未解决要求立即停工整顿
关键词夜间施工 噪音污染 环保投诉 施工许可 居民维权"""
        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "你是政务数据生成专家"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=600
            )
            raw_text = response.choices[0].message.content.strip()
            return self._parse_generated_text(raw_text)
        except Exception as e:
            # Best-effort: report and let the caller retry with a new draw.
            print(f"生成失败:{str(e)}")
            return None, []

    def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
        """Split raw GPT output into (content, up-to-5 keywords)."""
        # Accept both full-width and ASCII colons after the keyword marker.
        content = re.sub(r"关键词[::].*", "", text).strip()
        matches = re.findall(r"关键词[::](.+)", text)
        # FIX: indexing [0] on an empty findall raised IndexError when the
        # model omitted the keyword line; return no keywords instead (the
        # record is then rejected by _validate_record).
        keywords = matches[0].split()[:5] if matches else []
        return content, keywords

    def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
        """Five-way validation of a generated record."""
        return (
            len(content) >= 80 and                 # minimum length per prompt spec
            len(keywords) == 5 and                 # exactly five keywords requested
            category in keywords and               # base category must be a keyword
            content not in self.used_records and   # reject duplicates
            any(c.isdigit() for c in content)      # must contain a numeric element
        )
# Example input file (input.xlsx) — one column of base categories:
"""
| 基础分类 |
|--------------|
| 施工管理 |
| 消费维权 |
| 城市管理 |
| 公共服务 |
"""
def main() -> None:
    """Build the generator, produce 100 records, and save them to Excel."""
    generator = NationalDataGenerator(
        excel_path="input.xlsx",
        category_column="基础分类",
    )
    # Generate 100 validated records.
    df = generator.generate_dataset(100)
    # Persist the full dataset; the context manager closes the writer.
    with pd.ExcelWriter("government_12345_data.xlsx") as writer:
        df.to_excel(writer, index=False)
    print("生成数据示例:")
    print(df[["ID", "内容", "关键词", "参考答案"]].head(3).to_string(index=False))


if __name__ == "__main__":
    main()