diff --git a/examples/citations/match_example.py b/examples/citations/match_example.py new file mode 100644 index 0000000..f2fb380 --- /dev/null +++ b/examples/citations/match_example.py @@ -0,0 +1,19 @@ +from gomate.modules.citation.match_citation import MatchCitation +import json + +mc = MatchCitation() + +with open(f'sample5.json', 'r', encoding='utf-8') as f: + input_data = json.load(f) +print(input_data) +result = mc.ground_response( + question=input_data["question"], + response=input_data["response"], + evidences=input_data["evidences"], + selected_idx=input_data["selected_idx"], + markdown=True, + show_code=True, + selected_docs=input_data["selected_docs"], +) + +print(result) diff --git a/examples/citations/sample5.json b/examples/citations/sample5.json new file mode 100644 index 0000000..278001a --- /dev/null +++ b/examples/citations/sample5.json @@ -0,0 +1,166 @@ +{ + "question": "中国共产党的政治理念是什么?", + "response": "\n**总体阐述**:\n\n中国共产党的政治理念以人民为中心,强调执政为民、为人类进步事业奋斗,坚定理想信念,注重党的政治建设与自我革新。习近平强调,要始终把人民放在心中最高的位置,坚持和发展中国特色社会主义,保持党的先进性和纯洁性。\n\n**详细分析**:\n\n- **执政为民**:习近平指出,“中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。”这体现了党将人民的利益放在首位,致力于实现人民对美好生活的向往。\n \n- **为人类进步事业奋斗**:习近平强调,“中国共产党是为中国人民谋幸福的政党,也是为人类进步事业而奋斗的政党。”这表明党不仅关注中国人民的幸福,也致力于为全人类的共同福祉作出贡献。\n\n- **坚定理想信念**:习近平指出,“对马克思主义的信仰和对社会主义和共产主义的信念是共产党人的政治灵魂。”这强调了坚定理想信念对于共产党人来说至关重要。\n\n- **政治建设与自我革新**:习近平强调,“党的领导必须是全面的、系统的、整体的”,并要求全党加强党性锻炼,提高政治觉悟和政治能力。\n\n- **治国理政的本根**:习近平指出,“我们治国理政的本根,就是中国共产党的领导和我国社会主义制度。”这强调了党的领导和社会主义制度在治国理政中的核心地位。\n\n- **思想政治建设**:习近平强调,“加强思想政治建设,解决好世界观、人生观、价值观这个‘总开关’问题。”这体现了党对思想政治建设的重视。\n\n- **马克思主义指导地位**:习近平指出,“我们共产党人是坚定的马克思主义者”,并强调党的指导思想是马克思列宁主义、毛泽东思想和中国特色社会主义理论体系。\n\n- **不忘初心**:习近平强调,“不忘初心,方得始终”,要求党员牢记党的宗旨和使命。", + "evidences": [ + "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "中国共产党是为中国人民谋幸福的政党,也是为人类进步事业而奋斗的政党。中国共产党始终把为人类作出新的更大的贡献作为自己的使命。", + "革命理想高于天。共产主义远大理想和中国特色社会主义共同理想,是中国共产党人的精神支柱和政治灵魂,也是保持党的团结统一的思想基础。要把坚定理想信念作为党的思想建设的首要任务,教育引导全党牢记党的宗旨,挺起共产党人的精神脊梁,解决好世界观、人生观、价值观这个“总开关”问题,自觉做共产主义远大理想和中国特色社会主义共同理想的坚定信仰者和忠实实践者。", + "把党的政治建设摆在首位。旗帜鲜明讲政治是我们党作为马克思主义政党的根本要求。党的政治建设是党的根本性建设,决定党的建设方向和效果。保证全党服从中央,坚持党中央权威和集中统一领导,是党的政治建设的首要任务。全党要坚定执行党的政治路线,严格遵守政治纪律和政治规矩,在政治立场、政治方向、政治原则、政治道路上同党中央保持高度一致。要尊崇党章,严格执行新形势下党内政治生活若干准则,增强党内政治生活的政治性、时代性、原则性、战斗性,自觉抵制商品交换原则对党内生活的侵蚀,营造风清气正的良好政治生态。完善和落实民主集中制的各项制度,坚持民主基础上的集中和集中指导下的民主相结合,既充分发扬民主,又善于集中统一。弘扬忠诚老实、公道正派、实事求是、清正廉洁等价值观,坚决防止和反对个人主义、分散主义、自由主义、本位主义、好人主义,坚决防止和反对宗派主义、圈子文化、码头文化,坚决反对搞两面派、做两面人。全党同志特别是高级干部要加强党性锻炼,不断提高政治觉悟和政治能力,把对党忠诚、为党分忧、为党尽职、为民造福作为根本政治担当,永葆共产党人政治本色。", + "全体共产党员特别是党的领导干部,要坚定理想信念,始终把人民放在心中最高的位置,弘扬党的光荣传统和优良作风,坚决反对形式主义、官僚主义,坚决反对享乐主义、奢靡之风,坚决同一切消极腐败现象作斗争,永葆共产党人政治本色,矢志不移为党和人民事业而奋斗。", + "党性说到底就是立场问题。共产党人无论是想问题、搞研究,还是作决策、办事情,都必须站在党和人民立场上,而不能把个人利益放在第一位。这就是共产党人的党性原则。", + "“治国犹如栽树,本根不摇则枝叶茂荣。”我们治国理政的本根,就是中国共产党的领导和我国社会主义制度。在这一点上,必须理直气壮、旗帜鲜明。党的领导必须是全面的、系统的、整体的,必须体现到经济建设、政治建设、文化建设、社会建设、生态文明建设和国防军队、祖国统一、外交工作、党的建设等各方面。哪个领域、哪个方面、哪个环节缺失了弱化了,都会削弱党的力量,损害党和国家事业。", + "理想信念是共产党人的精神之“钙”,必须加强思想政治建设,解决好世界观、人生观、价值观这个“总开关”问题。对共产党人来说,理想信念是精神之“钙”,精神上缺了“钙”,就会得“软骨病”,就会导致政治上变质、经济上贪婪、道德上堕落、生活上腐化。“四风”问题归根到底是理想信念出现动摇所致。“石可破也,而不可夺坚;丹可磨也,而不可夺赤。”必须毫不放松抓好思想政治建设,点亮党员、干部心中的明灯,教育引导党员、干部筑牢思想防线,坚持“革命理想高于天”,保持蓬勃朝气、昂扬锐气、浩然正气。", + "我们共产党人是坚定的马克思主义者,我们党的指导思想就是马克思列宁主义、毛泽东思想和中国特色社会主义理论体系。同时,我们不是历史虚无主义者,也不是文化虚无主义者,不能数典忘祖、妄自菲薄。", + "不忘初心,方得始终。对马克思主义的信仰,对社会主义和共产主义的信念,是共产党人的政治灵魂,是共产党人经受住各种考验的精神支柱。只有理想信念坚定的人,才能始终不渝、百折不挠,不论风吹雨打,不怕千难万险,坚定不移为实现既定目标而奋斗。今天,每一个共产党员都要做共产主义远大理想和中国特色社会主义共同理想的坚定信仰者、忠实实践者,为实现“两个一百年”奋斗目标、实现中华民族伟大复兴的中国梦而英勇奋斗。" + ], + "selected_idx": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11 + ], + "selected_docs": [ + { + "file_name": "362.json", + "content": "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "chk_id": 1, + "doc_id": 206933, + "newsinfo": { + "source": "《人民日报》", + "date": "2014年2月7日", + "title": "《在俄罗斯索契接受俄罗斯电视台专访时的答问》" + } + }, + { + "file_name": "362.json", + "content": "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "chk_id": 4, + "doc_id": 206933, + "newsinfo": { + "source": "《人民日报》", + "date": "2014年2月7日", + "title": "《在俄罗斯索契接受俄罗斯电视台专访时的答问》" + } + }, + { + "file_name": "362.json", + "content": "中国共产党坚持执政为民,人民对美好生活的向往就是我们的奋斗目标。我的执政理念,概括起来说就是:为人民服务,担当起该担当的责任。", + "chk_id": 7, + "doc_id": 206933, + "newsinfo": { + "source": "《人民日报》", + "date": "2014年2月7日", + "title": "《在俄罗斯索契接受俄罗斯电视台专访时的答问》" + } + }, + { + "file_name": "1249.json", + "content": "中国共产党是为中国人民谋幸福的政党,也是为人类进步事业而奋斗的政党。中国共产党始终把为人类作出新的更大的贡献作为自己的使命。", + "chk_id": 25, + "doc_id": 207820, + "newsinfo": { + "source": "人民出版社单行本", + "date": "2017年10月18日", + "title": "《决胜全面建成小康社会,夺取新时代中国特色社会主义伟大胜利》" + } + }, + { + "file_name": "1249.json", + "content": "革命理想高于天。共产主义远大理想和中国特色社会主义共同理想,是中国共产党人的精神支柱和政治灵魂,也是保持党的团结统一的思想基础。要把坚定理想信念作为党的思想建设的首要任务,教育引导全党牢记党的宗旨,挺起共产党人的精神脊梁,解决好世界观、人生观、价值观这个“总开关”问题,自觉做共产主义远大理想和中国特色社会主义共同理想的坚定信仰者和忠实实践者。", + "chk_id": 44, + "doc_id": 207820, + "newsinfo": { + "source": "人民出版社单行本", + "date": "2017年10月18日", + "title": "《决胜全面建成小康社会,夺取新时代中国特色社会主义伟大胜利》" + } + }, + { + "file_name": "1249.json", + "content": "把党的政治建设摆在首位。旗帜鲜明讲政治是我们党作为马克思主义政党的根本要求。党的政治建设是党的根本性建设,决定党的建设方向和效果。保证全党服从中央,坚持党中央权威和集中统一领导,是党的政治建设的首要任务。全党要坚定执行党的政治路线,严格遵守政治纪律和政治规矩,在政治立场、政治方向、政治原则、政治道路上同党中央保持高度一致。要尊崇党章,严格执行新形势下党内政治生活若干准则,增强党内政治生活的政治性、时代性、原则性、战斗性,自觉抵制商品交换原则对党内生活的侵蚀,营造风清气正的良好政治生态。完善和落实民主集中制的各项制度,坚持民主基础上的集中和集中指导下的民主相结合,既充分发扬民主,又善于集中统一。弘扬忠诚老实、公道正派、实事求是、清正廉洁等价值观,坚决防止和反对个人主义、分散主义、自由主义、本位主义、好人主义,坚决防止和反对宗派主义、圈子文化、码头文化,坚决反对搞两面派、做两面人。全党同志特别是高级干部要加强党性锻炼,不断提高政治觉悟和政治能力,把对党忠诚、为党分忧、为党尽职、为民造福作为根本政治担当,永葆共产党人政治本色。", + "chk_id": 45, + "doc_id": 207820, + "newsinfo": { + "source": "人民出版社单行本", + "date": "2017年10月18日", + "title": "《决胜全面建成小康社会,夺取新时代中国特色社会主义伟大胜利》" + } + }, + { + "file_name": "561.json", + "content": "全体共产党员特别是党的领导干部,要坚定理想信念,始终把人民放在心中最高的位置,弘扬党的光荣传统和优良作风,坚决反对形式主义、官僚主义,坚决反对享乐主义、奢靡之风,坚决同一切消极腐败现象作斗争,永葆共产党人政治本色,矢志不移为党和人民事业而奋斗。", + "chk_id": 15, + "doc_id": 207132, + "newsinfo": { + "source": "《人民日报》", + "date": "2013年3月17日", + "title": "《在第十二届全国人民代表大会第一次会议上的讲话》" + } + }, + { + "file_name": "763.json", + "content": "党性说到底就是立场问题。共产党人无论是想问题、搞研究,还是作决策、办事情,都必须站在党和人民立场上,而不能把个人利益放在第一位。这就是共产党人的党性原则。", + "chk_id": 26, + "doc_id": 207334, + "newsinfo": { + "source": "《十八大以来重要文献选编》(上)", + "date": "2014年1月14日", + "title": "《严明党的组织纪律,增强组织纪律性》" + } + }, + { + "file_name": "750.json", + "content": "“治国犹如栽树,本根不摇则枝叶茂荣。”我们治国理政的本根,就是中国共产党的领导和我国社会主义制度。在这一点上,必须理直气壮、旗帜鲜明。党的领导必须是全面的、系统的、整体的,必须体现到经济建设、政治建设、文化建设、社会建设、生态文明建设和国防军队、祖国统一、外交工作、党的建设等各方面。哪个领域、哪个方面、哪个环节缺失了弱化了,都会削弱党的力量,损害党和国家事业。", + "chk_id": 0, + "doc_id": 207321, + "newsinfo": { + "source": "《十九大以来重要文献选编》(上)", + "date": "2018年2月28日", + "title": "《切实把思想统一到党的十九届三中全会精神上来》" + } + }, + { + "file_name": "743.json", + "content": "理想信念是共产党人的精神之“钙”,必须加强思想政治建设,解决好世界观、人生观、价值观这个“总开关”问题。对共产党人来说,理想信念是精神之“钙”,精神上缺了“钙”,就会得“软骨病”,就会导致政治上变质、经济上贪婪、道德上堕落、生活上腐化。“四风”问题归根到底是理想信念出现动摇所致。“石可破也,而不可夺坚;丹可磨也,而不可夺赤。”必须毫不放松抓好思想政治建设,点亮党员、干部心中的明灯,教育引导党员、干部筑牢思想防线,坚持“革命理想高于天”,保持蓬勃朝气、昂扬锐气、浩然正气。", + "chk_id": 17, + "doc_id": 207314, + "newsinfo": { + "source": "《党建研究》", + "date": "2014年1月20日", + "title": "《在党的群众路线教育实践活动第一批总结暨第二批部署会议上的讲话》" + } + }, + { + "file_name": "414.json", + "content": "我们共产党人是坚定的马克思主义者,我们党的指导思想就是马克思列宁主义、毛泽东思想和中国特色社会主义理论体系。同时,我们不是历史虚无主义者,也不是文化虚无主义者,不能数典忘祖、妄自菲薄。", + "chk_id": 0, + "doc_id": 206985, + "newsinfo": { + "source": "《人民日报》", + "date": "2014年10月13日", + "title": "《在十八届中央政治局第十八次集体学习时的讲话》" + } + }, + { + "file_name": "1270.json", + "content": "不忘初心,方得始终。对马克思主义的信仰,对社会主义和共产主义的信念,是共产党人的政治灵魂,是共产党人经受住各种考验的精神支柱。只有理想信念坚定的人,才能始终不渝、百折不挠,不论风吹雨打,不怕千难万险,坚定不移为实现既定目标而奋斗。今天,每一个共产党员都要做共产主义远大理想和中国特色社会主义共同理想的坚定信仰者、忠实实践者,为实现“两个一百年”奋斗目标、实现中华民族伟大复兴的中国梦而英勇奋斗。", + "chk_id": 2, + "doc_id": 207841, + "newsinfo": { + "source": "人民出版社单行本", + "date": "2016年11月29日", + "title": "《在纪念朱德同志诞辰一百三十周年座谈会上的讲话》" + } + } + ] +} \ No newline at end of file diff --git a/gomate/modules/citation/match_citation.py b/gomate/modules/citation/match_citation.py index ec9ccd9..1181138 100644 --- a/gomate/modules/citation/match_citation.py +++ b/gomate/modules/citation/match_citation.py @@ -1,9 +1,11 @@ import json from typing import List -from gomate.modules.document.utils import PROJECT_BASE + import jieba import loguru +from gomate.modules.document.utils import PROJECT_BASE + class MatchCitation: def __init__(self): @@ -54,9 +56,17 @@ def remove_stopwords(self, query: str): query = query.replace(word, " ") return query + def highlight_common_substrings(self, sentence, evidence_sentence, evidence, min_length=6): + evidence_sentences = self.cut(evidence) + current_sentence_index = next(i for i, s in enumerate(evidence_sentences) if evidence_sentence == s) + highlighted_text = evidence_sentences[current_sentence_index] + start_evidence = evidence.index(highlighted_text) + end_evidence = start_evidence + len(highlighted_text) + return [[start_evidence, end_evidence - 1]] + def ground_response( self, - question:str, + question: str, response: str, evidences: List[str], selected_idx: List[int], @@ -74,103 +84,139 @@ def ground_response( "selected_idx": selected_idx, "selected_docs": selected_docs } - - # Log using loguru - # loguru.logger.info(f"Response: {response}") - # loguru.logger.info(f"Evidences: {evidences}") - # loguru.logger.info(f"Selected indices: {selected_idx}") - # loguru.logger.info(f"Selected documents: {selected_docs}") - # Save to JSON file - output_file = "citation.json" - with open("citation.json", 'w', encoding='utf-8') as f: + output_file = "citation_match.json" + with open(output_file, 'w', encoding='utf-8') as f: json.dump(json_data, f, ensure_ascii=False, indent=4) - loguru.logger.info(f"Parameters saved to {output_file}") - - print(response) sentences = self.cut(response) - print(sentences) - selected_idx = [i - 1 for i in selected_idx] - quote_list = [] - final_response = [] - quote_index_map = {} # To keep track of existing quotes - best_idx = 0 - - for sentence in sentences: - print("===================sentence", sentence) - if not sentence.strip(): - # continue - final_response.append(sentence) - else: - sentence_seg_cut = set(jieba.lcut(self.remove_stopwords(sentence))) - sentence_seg_cut_length = len(sentence_seg_cut) - threshold = 0.6 - final_response.append(f"{sentence}") - group_list = [] - for i, idx in enumerate(selected_idx): - evidence = evidences[i] - evidence_sentences = self.cut(evidence) - for j, evidence_sentence in enumerate(evidence_sentences): + contents = [{"content": sentence} for sentence in sentences] + for cit_idx, citation in enumerate(contents): + citation['citation_content'] = [] + citation['best_idx'] = [] + citation['best_ratio'] = [] + citation['highlighted_start_end'] = [] + sentence = citation['content'] + # print("===================sentence", sentence) + # 答案内容进行分词 + sentence_seg_cut = set(jieba.lcut(self.remove_stopwords(sentence))) + sentence_seg_cut_length = len(sentence_seg_cut) + threshold = 0.5 + # 检索内容 + for doc_idx, doc in enumerate(selected_docs): + evidence_sentences = self.cut(doc['content']) + for es_idx, evidence_sentence in enumerate(evidence_sentences): + ## 可能存在空的片段 + if evidence_sentence.strip() and sentence.strip() : evidence_seg_cut = set(jieba.lcut(self.remove_stopwords(evidence_sentence))) overlap = sentence_seg_cut.intersection(evidence_seg_cut) ratio = len(overlap) / sentence_seg_cut_length + # print(sentence_seg_cut,evidence_seg_cut,ratio) + if ratio > threshold: best_ratio = ratio - highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence, evidence) + best_idx = doc_idx + best_sentence = evidence_sentence + highlighted_start_end = self.highlight_common_substrings(sentence, evidence_sentence, + doc['content']) + if best_idx not in citation['best_idx']: + citation['citation_content'].append(doc['content']) + citation['best_idx'].append(best_idx) + citation['best_ratio'].append(best_ratio) + citation['highlighted_start_end'].append(highlighted_start_end) + print(contents) + + citation_cnt = 0 + is_citation_exists = [] + for citation in contents: + best_idx = citation['best_idx'] + if best_idx not in is_citation_exists: + is_citation_exists.append(best_idx) + citation_cnt += 1 + + is_content_exists = [] + final_response = [] + quote_list = [] + best_indices = 0 + + is_group_exists=[] + for citation_idx, citation in enumerate(contents): + final_response.append(f"{citation['content']}") + + best_idxes = citation['best_idx'] + if len(best_idxes) > 0: + is_doc_id_exists = [] + group_list = [] + # 判断当前一组引用是否被当前段落引用过 + if best_idxes not in is_content_exists: + for idx, best_idx in enumerate(best_idxes): + # 判断当前组是否存在重复文档 + if selected_docs[best_idx]["doc_id"] not in is_doc_id_exists: group_item = { - "doc_id": selected_docs[i]["doc_id"], - "chk_id": selected_docs[i]["chk_id"], - "doc_source": selected_docs[i]["newsinfo"]["source"], - "doc_date": selected_docs[i]["newsinfo"]["date"], - "doc_title": selected_docs[i]["newsinfo"]["title"], - "chk_content": evidence, - "best_ratio": best_ratio, - "highlight": highlighted_start_end, + "doc_id": selected_docs[best_idx]["doc_id"], + "chk_id": selected_docs[best_idx]["chk_id"], + "doc_source": selected_docs[best_idx]["newsinfo"]["source"], + "doc_date": selected_docs[best_idx]["newsinfo"]["date"], + "doc_title": selected_docs[best_idx]["newsinfo"]["title"], + # "chk_content": selected_docs[best_idx]['content'], + "chk_content": citation['citation_content'][idx], + "best_ratio": citation['best_ratio'][idx], + "highlight": citation['highlighted_start_end'][idx], } group_list.append(group_item) + is_doc_id_exists.append(selected_docs[best_idx]["doc_id"]) + # 合并引用 + group_list.sort(key=lambda x: x['best_ratio'], reverse=True) + + merged_group_list = [] + reference = group_list[0] + reference_tokens = set(jieba.lcut(self.remove_stopwords(reference['chk_content']))) + merged_group = [reference] + for item in group_list[1:]: + item_tokens = set(jieba.lcut(self.remove_stopwords(item['chk_content']))) + if len(reference_tokens.intersection(item_tokens)) > 5: + merged_group.append(item) + else: + merged_group_list.append([item]) + # merged_group = [item] + if merged_group: + merged_group_list.append(merged_group) + for group in merged_group_list: + group_data={ + "doc_list": group, + "chk_content": group[0]["chk_content"], + "highlight": group[0]["highlight"], + } + doc_id_list=[doc['doc_id'] for doc in group_data['doc_list']] + # print(doc_id_list) + if doc_id_list not in is_group_exists: + quote_list.append(group_data) + best_indices += 1 + final_response.append(f"{[best_indices]}") + is_group_exists.append(doc_id_list) + else: + # print("已存在") + final_response.append(f"{[is_group_exists.index(doc_id_list)+1]}") + + data = {'result': ''.join(final_response), 'quote_list': quote_list, 'summary': ''} + # Save to JSON file + json_data['result'] = ''.join(final_response) + json_data['quote_list'] = quote_list + output_file = "citation_match_res.json" + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(json_data, f, ensure_ascii=False, indent=4) - if group_list: - # Create a unique key for the group_list based on its content - group_key = tuple(sorted((item["doc_id"], item["chk_id"]) for item in group_list)) - - if group_key in quote_index_map: - # If this group already exists, use its index - existing_idx = quote_index_map[group_key] - final_response.append(f"[{existing_idx}]") - else: - # If this is a new group, add it to quote_list and update the index - best_idx += 1 - quote_index_map[group_key] = best_idx - quote_list.append({ - "doc_list": group_list, - "chk_content": group_list[0]["chk_content"], - "highlight": group_list[0]["highlight"], - }) - final_response.append(f"[{best_idx}]") - - # final_response.append("。") - # final_response.append("\n") - # print(''.join(final_response)) - data = {'result': ''.join(final_response), 'quote_list': quote_list,'summary':''} + loguru.logger.info(f"Parameters saved to {output_file}") + print(json_data) return data - def highlight_common_substrings(self, sentence, evidence_sentence, evidence, min_length=6): - evidence_sentences = self.cut(evidence) - current_sentence_index = next(i for i, s in enumerate(evidence_sentences) if evidence_sentence == s) - highlighted_text = evidence_sentences[current_sentence_index] - start_evidence = evidence.index(highlighted_text) - end_evidence = start_evidence + len(highlighted_text) - return [[start_evidence, end_evidence-1]] - if __name__ == '__main__': mc = MatchCitation() - with open(f'{PROJECT_BASE}/data/docs/citations_samples/sample1.json','r',encoding='utf-8') as f: - input_data =json.load(f) - print(input_data) + with open(f'{PROJECT_BASE}/data/docs/citations_samples/sample19.json', 'r', encoding='utf-8') as f: + input_data = json.load(f) result = mc.ground_response( question=input_data["question"], response=input_data["response"], diff --git a/gomate/modules/citation/source_citation.py b/gomate/modules/citation/source_citation.py index f90ee39..df2a5b0 100644 --- a/gomate/modules/citation/source_citation.py +++ b/gomate/modules/citation/source_citation.py @@ -157,7 +157,7 @@ def ground_response( sentence_seg_cut = set(jieba.lcut(self.remove_stopwords(sentence))) sentence_seg_cut_length = len(sentence_seg_cut) - threshold = 0.2 + threshold = 0.3 # 检索内容 for doc_idx, doc in enumerate(selected_docs): evidence_sentences = self.cut(doc['content']) @@ -193,19 +193,16 @@ def ground_response( quote_list = [] best_indices = 0 - for citation in contents: - is_doc_id_exists = [] - group_list = [] - + for citation_idx,citation in enumerate(contents): if citation_cnt > 1: - citation['title'] = self.convert_to_chinese(str(best_indices + 1)) + '、' + citation['title'] - citation['title'] = "**" + citation['title'] + "**" - else: - citation['title'] = "**" + citation['title'] + "**" + citation['title'] = self.convert_to_chinese(str(citation_idx + 1)) + '、' + citation['title'] + citation['title'] = "**" + citation['title'] + "**" + final_response.append(f"{citation['title']}") best_idxes = citation['best_idx'] print(best_idxes) - + is_doc_id_exists = [] + group_list = [] # 判断当前一组引用是否被当前段落引用过 if best_idxes not in is_content_exists: for idx, best_idx in enumerate(best_idxes): @@ -225,17 +222,42 @@ def ground_response( group_list.append(group_item) is_doc_id_exists.append(selected_docs[best_idx]["doc_id"]) - quote_list.append({ - "doc_list": group_list, - "chk_content": group_list[0]["chk_content"], - "highlight": group_list[0]["highlight"], - }) - best_indices += 1 - final_response.append(f"{citation['title']}{[best_indices]}\n\n") - # final_response.append(f"{citation['title']}\n") - # final_response.append(f"\n{citation['content']}{[best_indices]}\n\n") - - is_content_exists.append(best_idxes) + # 合并引用 + group_list.sort(key=lambda x: x['best_ratio'], reverse=True) + + merged_group_list = [] + reference = group_list[0] + reference_tokens = set(jieba.lcut(self.remove_stopwords(reference['chk_content']))) + merged_group = [reference] + # print(len(group_list)) + for item in group_list[1:]: + item_tokens = set(jieba.lcut(self.remove_stopwords(item['chk_content']))) + if len(reference_tokens.intersection(item_tokens)) > 15: + merged_group.append(item) + else: + merged_group_list.append([item]) + # merged_group = [item] + if merged_group: + print(len(merged_group)) + merged_group_list.append(merged_group) + for group in merged_group_list: + quote_list.append({ + "doc_list": group, + "chk_content": group[0]["chk_content"], + "highlight": group[0]["highlight"], + }) + + best_indices += 1 + final_response.append(f"{[best_indices]}") + # quote_list.append({ + # "doc_list": group_list, + # "chk_content": group_list[0]["chk_content"], + # "highlight": group_list[0]["highlight"], + # }) + # best_indices += 1 + # final_response.append(f"{citation['title']}{[best_indices]}\n\n") + # + # is_content_exists.append(best_idxes) data = {'result': ''.join(final_response), 'quote_list': quote_list, 'summary': response['summary']} # Save to JSON file @@ -266,4 +288,4 @@ def ground_response( selected_docs=input_data["selected_docs"], ) - # print(result) + # print(result) \ No newline at end of file diff --git a/gomate/modules/clusters/corpus.py b/gomate/modules/clusters/corpus.py new file mode 100644 index 0000000..71c77e4 --- /dev/null +++ b/gomate/modules/clusters/corpus.py @@ -0,0 +1,585 @@ +#!/usr/bin/env python +# -*- coding:utf-8 _*- +""" +@author:quincy qiang +@license: Apache Licence +@file: corpus.py +@time: 2021/06/16 +@contact: yanqiangmiffy@gamil.com +@software: PyCharm +@description: coding.. +""" +import pandas as pd + +documents = [[60202106020360275, '昨晚六合彩开奖结果', '昨晚六合彩开奖结果'], + [60202106020386217, '周二主要经济数据', '#EJ GLOBAL金融 - 周二主要经济数据\n'], + [60202106020360482, + '今、明两天电视节目精选', + '周三4:50pm now670 、671 、有线651及652台法国网球公开赛周三6:00pm 有线662台日职: FC横滨对川崎前锋周三6:00pm 有线661台FIVB世界女排联?:泰国对美国周三10:00pm 有线661台FIVB世界女排联?:中国对土耳其周四1:00am有 线662台国际赛:挪威对卢森堡周四1:00am now680台EDGE SPORT号周四3:00am有线661台FIVB世界女排联?:巴西对意大利推号周四3:00am有 线662台价国际?:英格兰对奥地利'], + [60202106020013837, + '灵活提升使用率 增康体场地供应', + '香港的“生存空间”问题远不止于居住用地,全港绝大部分行业也缺乏土地。当社会各界警告本港未来5年的新建房屋量将出现“断崖式下跌”时,康文署辖下的康乐和体育场地数量,过去5年也是接近零增长。康乐用地的需求结构与商住用地不同,用户主要是以“小时”租用,而非商住用地般以年计算所以康乐场地要增加供应,可以侧重于“时间”,即只须提升现有场地的使用效率,就能达到增加供应的目的。香港很多中小学校的体育设施,在平8晚间和周六日的“繁忙时!间”空置,可免费开放或取酬出租予社会人士,释放康乐空间。若此模式经营得当,更可仿效北美高等院校的做法,将出租场地和举办体育活动的收入用作建立奖学金,以及投资改善校园设施。\n日前,商务及经济发展局局长邱腾华表示,预计最快7月底容许邮轮公司复办航程“没有特定目的地”的“公海游”,供港人参加。不少业界人士随即表示支持,并强调不会聘用仍属高风险地区的船员营运,希望尽快成事。\n笔者认为,推动“公海游”的意义并不止于业界“成功争取”,更显示出特区政府看待休闲产业在社会中的地位,已出现范式转移。简单地说,容许“无目的地”邮轮开航,代表政府认同交通工具的存在价值,不只是为了前往目的地,“享受过程”也可以是同样重要。\n此前,运房局已于上月中建议修订《道路交通条例》,支持香港自动驾驶行业的发展,可见在政府的观点中,港人使用交通工具之目的,不再止于“前往目的地”。增加市民在驾驶过程中的舒适度,也开始成为政府施政的考虑因素,甚至可成为未来经济发展的方向。Tesla豪华电动车5年前成功在港扎根后,经由香港输出到全世界,其实也是成功例子。\n特区政府对港人休闲娱乐、生活态度的重视程度,其实自往年“封关”后,已逐步提升,例如特首林郑月娥在去年《施政报告》中宣布港珠澳大桥香港车免中港牌北上、海洋公园改革、“跃动港岛南”计划、以至本文所说的自动驾驶技术修例,或多或少也是受到港人疫下缺乏娱乐的悲愤情绪所驱动。难得政府在改革方向上已开了头,我们今日应抓紧机会,突破既有框框,对康乐产业的改革多加讨论。\n增使用时间 换活动空间\n香港的“生存空间”问题远不止于居住用地,全港绝大部分行业,包括康乐体育,也缺乏土地。当社会各界警告本港未来5年的新建房屋量将出现“断崖式下跌”时,康文署辖下的康乐和体育场地数量,过去5年基本上也是接近零增长,至近期政府对康乐行业“上有政策”后,康文署才开始对辖下设施实行“供应侧改革”,例如5月初宣布打击“炒场”等新措施。\n如果以传统城市规划、绘制地图的“平面”概念去想,康体设施的供应管理其实是在占用珍贵的土地资源,所以近年一直不获关注。但康乐用地的需求结构与商住用地不同,用户主要是以“小时”租用,而非商住用地般以年计算,所以康乐场地要增加供应,侧重点并不一定在“空间”,而可以是在于“时间”。用公共政策研究的语言来说,即是只须提升现有场地的使用效率,就能达到增加供应的目的。\n中小学出租体育设施 可常规化\n香港很多中小学校的体育设施,在平日上课期间,表面上已无空档,但其实大部分学生都只会在平日日间,即康文署定义的“非繁忙时间”内使用场地(平日晚间和周六日则为“繁忙时间”)。全港1,000间中小学,加起来就有最少千多个球场空置,与康文署在场地供应上有极大互补性。\n一些学校其实亦已有既定做法,在平日晚间和周末日间,将场地借予旧生使用。会将这种做法常规化的学校不多,主要是看旧生会规模,如果和校方协调能力较强便会安排,官校的例子有皇仁书院;英基辖下英皇佐治五世学校的球场设施则明码实价,对外开放,经审批就可租用。无论是免费开放还是出租取酬,中小学校于办公时间后释放康乐空间予社会人士的做法,都应该推广。\n今日大部分香港人都受过9年或以上免费教育,要全港中小学常规性出租康乐场地予旧生及其家属,在保安和行政上其实不难。假以时日,若此模式经营得当,更可仿效北美高等院校的做法,将出租场地和举办体育活动的收入用作建立奖学金,以及投资改善校园设施,为人口老化下日薄西山的香港教育行业注入新动力。\n运动场草地 可开放出租\n同理,康文署的25个室外运动场,主要使用者也是中小学,每年租两个早上,用来举办运动会。和校内设施一样,运动场的所谓“使用率高”也只是平日办公时间内使用率高,平日晚上和周末(亦即其他设施的“繁忙时间”)使用率其实甚低,最多是免费开放作缓跑径,中间的大片草地更是只作观赏用途。\n其实该类场地亦可以“时间维度”改革,大幅增加供应。以笔者熟悉的马球为例,本港的马球队以业余球员为骨干,一般练习时间总是在上班前的清晨、下班后的黄昏和周末长假期,而且只使用中场草地,与学校运动会市民公余跑步并无竞争。配合“跃动港岛南”新政策,只要在香港仔运动场练习、马匹寄养在本来就有动物设施的海洋公园便可,基本上毋须动用公帑增设任何基建和改变土地用途,已可进行“供应侧改革”。\n至于草地的保养成本和技术问题,也是可以用钱解决,例如马球会租用运动场草地收取较高租金,亦可以较高租金出租予其他球类活动。对于运动场的草地,社会明显有需求,政府只需要增加租用模式和租金去抵销草地的保养成本便可。如此既可改善市民在香港生活的幸福感,又可制造相关的辅助专业职位,一箭双鵰,何乐而不为?\n刘国匡 时事评论员\n'], + [60202106020014271, + 'Activity at factories surges as costs bite', + 'Production expands atfastest pace this year,but inflation pressuresare seen to take tollReutersChina’s factory activity expandedat the fastest pace this year in Mayas domestic and export demandpicked up, though sharp rises inraw material prices and strains insupply chains crimped somecompanies’ production, abusiness survey showedyesterday.The Caixin/Markit manufacturingpurchasing managers’ index(PMI) rose to 52.0 last month,the highest level since Decemberand inching up from April’s 51.9.Analysts polled by Reuters hadexpected the index to remain at51.9. The 50-mark separatesgrowth from contraction on amonthly basis.New orders rose at the strongestpace so far this year and agauge for export orders was thehighest since November, but theoutput reading, while still solid,was slightly lower than theprevious month.“Rapidly rising commodityprices began to disrupt the economyas some enterprises began tohoard goods, while some otherssuffered raw material shortages.Supply chains were also significantlyaffected,” Wang Zhe, senioreconomist at Caixin InsightGroup, said.A subindex for input costsexpanded at the fastest pace since2016. Manufacturers passed onsome of the pressure to their customers,with a gauge for outputprices rising at the quickest pacein a decade. Charges for exportedgoods rose at the fastest rate inthree years.Prices for commodities suchas coal, steel, iron ore and copperhave surged this year, fuelled bythe lifting of pandemic lockdownsin many countries and ampleglobal liquidity.Policymakers have repeatedlyexpressed concern about risingcommodity prices in recentweeks and called for stricter managementof supply and demandand to crack down on “maliciousspeculation”.The warnings have promptedsome pullback in metals pricesbut analysts are unsure how longsuch a correction may last givenimproving global demand.Firms continued to hire but ata slower pace, with some enterprisessaying they were trying toreduce costs.Earnings at industrial firmsgrew at a slower pace in April, withhigh commodity prices and weakerperformance in the consumergoods sector limiting overall profitabilityfrom manufacturing.The economy posted recordgrowth in the first quarter as itrecovered quickly from thepandemic, although analystsexpected the brisk expansion tomoderate later this year.Rapidly risingcommodityprices beganto disruptthe economyWANG ZHE, SENIOR ECONOMIST'], + [60202106020013873, + 'Police hold 11 over plot to attack churches', + 'Associated Press in JakartaIndonesian police have said theyarrested 11 suspected Islamicmilitants accused of plottingattacks at several Christianchurches in easternmost Papuaprovince.On Friday, the elite counterterrorismsquad arrested 10 suspectsin several raids in Papua’sMerauke district after receivinginformation about plannedattacks in the province, a predominantlyChristian region in Muslim-majority Indonesia, Meraukepolice chief Untung Sangaji said.The arrests led police to an-other suspect who was detainedon Sunday, and led them to seizeitems including chemicals forexplosives, modified air guns ableto fire real bullets, jihadist booksand documents on plannedattacks, he said.Sangaji said those arrestedwere suspected of being membersof Jemaah Anshorut Daulah,which has pledged allegiance tothe Islamic State and carried out aseries of suicide bombings.“They allegedly planned toattack churches in several placesin Merauke,” he said. He declinedto provide more details, saying theinvestigation was still under way.He said some of the suspectswere believed to have links to asuicide attack outside a RomanCatholic cathedral in March inMakassar, South Sulawesi.Arrests of suspected Islamicmilitants are rare in Papua, aformer Dutch colony in thewestern part of New Guinea that isethnically and culturally distinctfrom much of Indonesia. It wasincorporated into Indonesia in1969 after a UN-sponsored ballotthat was seen as a sham by many.In late 2019, police arrestedseven suspected militants inJayapura, the capital of Papuaprovince, who had fled from acounterterrorism crackdown onother Indonesian islands.'], + [60202106020014069, + 'Mighty model', + 'An assembled qualification model of SouthKorea’s space rocket Nuri is moved to a site atNaro Space Centre in Goheung, South Korea,to undergo tests such as fuel tank charging andlaunch pad separation. The actual flight modelof the rocket that will be used on the day isunder construction. South Korea has reserved2 trillion won (HK$14 billion) to develop its firsthome-grown space launch vehicle and hashigh hopes for an October blast-off. Photo: EPA'], + [60202106020013874, + 'School’s out for students defying junta', + 'Agence France-Presse in YangonSchools in Myanmar openedyesterday for the first time sincethe military seized power, butteachers and students were set todefy the junta’s calls for full classroomsin a show of resistance.Four months of turmoil havefollowed the February ousting ofcivilian leader Aung San Suu Kyi,with more than 800 people killedby security forces and a nationwidestrike crippling the economy,as the military fights antijuntamilitias on several frontsand struggles to impose order.Public school teachers –dressed in their green and whiteuniforms – were prominent in theearly mass protests, joining railwayworkers, doctors and civilservants on the streets.The junta insisted schoolsopen yesterday after a year’s absencebecause of Covid-19, butmany educators had already decidedthey could not return.“I’m not afraid of their arrestand torture,” said Shwe Nadi, ateacher from the commercial capitalwhose name has beenchanged for her safety. “I’m afraidof becoming a teacher who teachesthe students propaganda.”The 28-year-old was fired forsupporting the civil disobediencemovement – one of the thousandsof teachers and academics thejunta has sacked.“Of course, I feel bad losing myjob because I loved being a teacher.Although it is not well paid, wehave our pride for being teachersas others respect us,” she said.Primary schoolteacher NuMay – not her real name – insouthern Mon state said shewould also stay away. She lostmonths of her salary after joiningthe nationwide boycott, but said“my soul is pure” because shetook part in the strike.“When I see how they havekilled a lot of people, I feel I don’twant to be their teacher anymore,” she added.Some of those killed in thecrackdown were of primaryschool age and charity group Savethe Children said the dead included15 children under the age of 16.Junta-run media has beenpromising that parents would be“satisfied” with a return of classesand students at a school near thecapital Naypyidaw opened a setpiececeremony to mark the newterm by performing a “NationalEnrolment Week” song in front ofthe regime’s education minister,according to the Global New Lightof Myanmar state newspaper.But at one high school in centralSagaing region, a slogandaubed in red paint on the front ofthe building urged staff membersto stay away. “We do not want themilitary slavery teachers,”showed pictures carried by localmedia. “We do not want theteachers who are traitors.”Some university classes areback in session, but boycotts haveseen widespread absences onboth sides of the teaching lectern.“Not one of my friends isgoing,” said an English major at auniversity in Mawlamyine, a citythat saw brutal crackdowns bysecurity forces against protesters.“So I decided not to go too.”Her class of 100 is now empty,despite students being summonedby the few remaining professorson campus.Protesters have discouragedpeople from sending children toschools that still have teacherswilling to work, saying it amountedto backing the military regime.“Do not be sad when you cannotenrol your child at schoolwhen some parents have no childrento enrol,” read a banner inBago region, south of the capital.Teacher Shwe Nadi said shewas committed to the civil disobediencemovement. “I won’t runbecause I have not committedany crimes,” she said. “If theywant to arrest me, I am prepared.”Meanwhile, the junta used artilleryand helicopters on Mondayagainst militias in the country’seast, rebels said, forcing residentsto flee and join thousands ofothers displaced by fighting.Additional reporting by Reuters'], + [60202106020014244, + '网络红人开店推广加密货币', + '习目港本地网络红人陈怡,臼几年前在机缘巧合之下接触到加密货币,直至去年才全力加入投资队伍,近月更瞓身开设“。TC shop”,广邀大众一起认识及投资加密货币。\n她忆述,约在2018年初,有一位内地客人坚持用比特币(Bilcoin)付款,当时Bi.co洫价钱约6.000余美元,折合今万多港元,大概都知道Bi,coin“系乜嚟”,鉴于“唔收就做唔到生意”,最后无奈接受3侗几Bilcoin作为货款,坦言当时迎怎样卖出去都不知过.不过,经过后来慢慢了解才知道Bi,c。in是技 一回耶,于是开始囤积Bi.c。",将每个月剩余的钱u!!"入两坐笼泅加密货币B5cc05n及以人币。\n她Q,L,当时都不敢向他人提及,因为一讲就“畀人泼冷水”,加密货币仍臈真s门投资工具。白至去年,才Oq始在纲上谈及加密货币,亦00设私人群组介绍粉丝贸Bi.co",当时创椠家马斯克(E!on M1.sk)仍未斥巨资买入Bi.co沁,币osO仅约9.000美元;而当时另一只笼头加密货币以太币,仿约200美元。\n教人投资助增向上流动机会\n她指’向人推介加密货币的初衷是有感年轻人向上流的机会渺菠’在地眼,1,唯一可以翻盘的机会,就是透魍投资加密货币为财富增值,可以增加向上流的机会,更是推翻香港日前极度不健康的社会生态的方法。参考刚刚公布的新富豪榜,香港白手兴家的富豪在全球数娥最少,因为全球都在发胶科技创新,而香港仍然集中地产投资;而仿年轻富豪Sam Ban如、a,\\一hiCd,就足恐加密货币而一跃名列于富豪榜内‘\n开铺分享知识设一条龙服务\n她指,正所谓“衣食足,知荣好”,令市民财富增加’是令香港社会更加美好的元素之一。因此她立志成为推手,教人如何投资加密货币。\n由于太多网上的观众希望买入J,I!密货币增篮,因而岍生00脯摘OTC sh。D的想法,且于今年初开业,r用埋一个安全平台去教人,我唔系要炒币,我要大家稳定地赚大钱。”陈怡称’很多人以为要有足够知识才可以进入币圈投资,“其实晤需要,你需要嘅系坚定同远见’我白间都唔系样样知过得好深入,但我睇到加密货币前景。”自然就可以分一杯羹,亦有幸成为其中一个橱轮。\n她希望有生之年可以见到香港不再是投资地产独大,香港人不再为一髑楼而贡献自己一生。具店铺提供一条龙服务,由零80始教导投资者如何贸,9C加密货币,由选押平台,如何操作,乃至于兑换俗称“U仔”的USBT,都有同事从旁协助。她认为,港人对加密货币的认识不及内地人,可以发掘的市场潜力仍然很大,未来的殿可观。她解陴,其店誧只赚兑换LSBT的差侦,而汇率比其他OTC s比p更便宜。\nB5tC05n回报高大牛市开始\n陈怡56I指,加密货币的好处是去中心化。尤其是Bicc。in’敝户参与度达九成’大户对其影窖较细’极少人为操作,且!叫报率撷高,去年其刚推介时每枚值9.000美元’早前已升穿6万美元,最近亦值3万余美元,是市面上罕见的高回报产品,相信日前只是B5.coi”大牛市的开始而已。她反问,“为何仍停留在买楼投资的观念之中?”\n不建议炒卖期货与赌钱无异\n她迹议投资加密货币’可以由其较睇好的BicC(,in人手,并直指Bi,coi.、不是凭空出现,而是已有长远12年的社区共识,可以称得上是“数码黄金” 。不过,她不建议炒2QCBi,c05n期货,坦言与赌钱无异,可以愉身家。(东方日报)及东绸并不提供任何投资建议.此文章内容为客户个人意见.不代表(东方日报)及东绸立场。任何文章涉及的法律卖任与{东方日报}及东纲无关。'], + [60202106020014098, + '精准传播 推进中国故事全球化表达', + '【大公报讯】据新华社报道:中共中央政治局5月31日下午就加强中国国际传播能力建设进行第三十次集体学习。中共中央总书记习近平在主持学习时强调,要全面提升国际传播效能,建强适应新时代国际传播需要的专门人才队伍。要加强国际传播的理论研究,掌握国际传播的规律,构建对外话语体系,提高传播艺术。要采用贴近不同区域、不同国家、不同群体受众的精准传播方式,推进中国故事和中国声音的全球化表达、区域化表达、分众化表达,增强国际传播的亲和力和实效性。要广交朋友、团结和争取大多数,不断扩大知华友华的国际舆论朋友圈。要讲究舆论斗争的策略和艺术,提升重大问题对外发声能力。\n习近平强调,各级党委(党组)要把加强国际传播能力建设纳入党委(党组)意识形态工作责任制,加强组织领导,加大财政投入,帮助推动实际工作、解决具体困难。各级领导干部要主动做国际传播工作,主要负责同志既要亲自抓,也要亲自做。要加强对领导干部的国际传播知识培训,发挥各级党组织作用,形成自觉维护党和国家尊严形象的良好氛围。各级党校(行政学院)要把国际传播能力培养作为重要内容。要加强高校学科建设和后备人才培养,提升国际传播理论研究水平。'], + [60202106020458847, + '关于遭受冒犯的不妥协宣言', + '当我察觉到自己又犯下不识抬举的毛病时,难免担心再不收敛,真性情便会开罪全世界。\n别人明明是好心向我提意见,我怎么会觉得遭受冒犯呢?\n然而,我跟对方交谈不到十分钟,理论上还算是陌生人,他却对我的人生指指点点,纵使所提出的意见有其道理,纵使知道对方并无恶意,我还是无法坦然接受随便对我的人生所下的定论。\n我已经一再表明立场和想法,对方依然坚持我该照着他的方法改变生活习惯,这样不是强人所难吗?对方的自以为是让我感到深受冒犯,每个人的生活方式都属于私人领域,怎能容许一个陌生人胡乱指手划脚?\n是我自尊心太重吧?抑或是太自我中心?对方根本无意冒犯我,为甚么我不可以一笑置之?反正往后也不会再跟对方见面,有甚么好执着呢?\n有时候我执着到连自己也受不了。\n譬如说,听到恭维说话笑着接纳就行,但有时候我连听到恭维说话也觉得讨厌。\n美女当然是褒词,正常来说被唤作美女时会感到欣悦,即使对方只是茶餐厅的侍应,或是街市水果档的档主,因为知道他们称我做美女不过是职业需要,所以不会感到抗拒。但如果在公开场合,有一个对所有女人都叫美女的猥琐男人大声以"美女"称呼我,我不但感到不悦,还会有一种遭受冒犯的感觉。\n我才不要做你口中的美女,请不要用你的嘴巴呼唤我,那让我全身每一个毛孔都反感地张开来,得耗尽所有力气才能抑压住高声尖叫的冲动。\n于是我知道,我根本是对人不对事。这很讨厌,完全不识抬举,但当我遭受冒犯时,实在无法继续堆出笑脸跟你哈哈哈,我会毫不掩饰地让你知道,我不喜欢你,不喜欢你说的话。\n我原本想写忏悔文,结果写着写着变了不妥协宣言,实在罪过。\n祁淇\n逢周三见报'], + [60202106020654889, + '巴西接办美洲杯', + '【新华社里约热内卢五月三十一日电】(记者赵焱 陈威华)亚松森消息:南美洲足联三十一日宣布,在哥伦比亚和阿根廷先后放弃主办二0二一美洲杯足球赛后,该组织决定本届美洲杯将在巴西举行。\n由于阿根廷在三十日晚间宣布放弃主办美洲杯,而原定的另一个东道主哥伦比亚已于本月二十日宣布放弃主办,南美洲足联三十一日上午召开紧急会议,讨论更换比赛地点。最终会议认为巴西在二0一九年举办了上一届美洲杯,有充份的经验,并且巴西拥有多座条件良好的球场,因此将东道国更换为巴西。\n在当日会议开始前南美洲足联和巴西足协都并没有考虑让巴西再次举办这一赛事。此前递交过主办申请的国家除阿根廷和哥伦比亚外,有厄瓜多尔和委内瑞拉,而在哥伦比亚放弃主办权后,智利曾经表示过愿意与阿根廷共同主办的意愿,此外还有不少声音呼吁在美国举办,但南美洲足协表示不会考虑。\n在会议期间,与会者认为巴西有多座场馆闲置,因此在巴西举办比赛不会对巴西国内联赛产生影响,最终达成一致将二0二一美洲杯足球赛移师巴西举行。'], + [60202106020654884, + '暴雨致大树倒塌', + '【港讯】荃湾多层停车场大厦面向青山公路荃湾段对开,昨日下午在暴雨下有大树倒塌,压中两辆停泊车辆,无人受伤。\n事发在下午四时许,一棵高约十五米的大树,怀疑在暴雨天气下塌下,树根清楚可见。现场当时约有五架车辆停泊,其中两架被压中轻微损毁。消防到场用工具移走树枝。'], + [60202106020625733, + '香港过去五年诈骗案件\u3000涉款高达百卅亿元', + '【香港中通社六月一日电】香港诈骗案数字持续上升,特区政府保安局副局长区志光六月一日出席立法会保安事务委员会指出,过去五年在港发生的诈骗案涉款超过一百三十亿港元,去年更按年增九成,警方已增加人手调查。\n根据保安局向立法会提交的文件,由于疫情下的社交距离措施及在家工作安排等,市民留在家中使用互联网,以及运用社交媒体购物及交友,增加堕入各类骗案的机会,因此诈骗案大幅增加。\n区志光表示,电话、电邮骗案成本低,而且被捕风险低,仅去年香港就录得一点五万宗诈骗案,按年增加近九成,占整体罪案数字四分一。而过去五年,在港发生的诈骗案涉款已超过一百三十亿港元。\n区志光指出,警方已增加人手和资源严加打击诈骗案,例如在过去三年,反诈骗协调中心和网络安全及科技罪案调查科共增加一百二十八个职位,处理电话查询和电话止付机制的工作等。反诈骗协调中心至今及时预防了一千零一十二宗进行中骗案,截至今年三月,成功拦截二千一百七十宗个案,涉款超过七十八点五亿港元。'], + [60202106020732493, + '费达拿郑赛赛晋次轮', + '【新华社巴黎5月31日电】(记者萧亚卓)2021法国网球公开赛首轮比赛31日继续进行,当天登场的两名中国女单选手均经历了三盘苦战,但是结局却不太一样。\n此前五次在红土大满贯上止步首轮的郑赛赛以4:6、6:4、6:4,耗时两小时41分钟逆转战胜世界排名第49位的西班牙选手托莫,职业生涯首次在罗兰加洛斯收获一场女单正赛的胜利。\n而另一位中国选手朱琳同样是在4:6先丢一盘的情况下以6:4扳回一盘,但却在决胜盘中未能把握住机会,以4:6不敌28号种子、美国选手佩古拉。\n女单其他比赛中,卫冕冠军、来自波兰的斯维亚特克以6:0、7:5击败斯洛文尼亚选手尤万晋级次轮。8号种子小威廉斯、去年的亚军克宁也分别战胜各自对手晋级。\n男单方面,当天最受瞩目的无疑是时隔2年再次回到罗兰加洛斯的瑞士“天王”费达拿,以8号种子身份出战的费天王直落三盘以6:2、6:4、6:3轻取乌兹别克斯坦选手伊斯托明晋级第二轮。'], + [60202106020732511, + '澳门福建体总慰问跑手家属', + '【专访】福建体育总会昨日中午探访珠海“凯乐石跑山训练赛”时意外身亡刘先生家属,表达深切慰问及向家属送上慰问金,以助解决即时之需。\n据悉,死者刘先生是福建体育总会的会员,跑步已有数年,早前亦参加本地马拉松赛事。其家属表示,事发当天还与死者联络,因此收到发生意外的消息时感到难以置信,十分震惊和伤心,并提出多点质疑,认为悲剧是由人为过失造成,希望有关方面查出真相,还死者公道。\n家属刘先生指死因初判为高温中暑猝死,但有关方面一直未交代任何赛事件存在的各种管理遗漏和安全问题,并且由头到尾没有提出过赔偿。他称现阶段会先妥善处理好遇难者的身后事,但同时希望有关方面能查明真相,是否涉及人为过失,查出责任谁属,还遇难者一个公道。\n福建体育总会会长严耀庭表示,此名越野跑手为本会会员,发生此次意外深感悲痛;由于户外越野运动环境复杂多变,选择安全可靠的赛道、线路是基础﹔赛事组织方要在天气预报、医疗救治、应急救援方面提供全方位保障,同时也必须严格对参赛选手身体素质作要求。\n澳门福建体育总会冀有关当局必须彻查事件,并要求进一步加强赛事安全管理工作,不断完善体育领域安全风险防控制度和举措。检视这场比赛的各项安全管理工作是否达标,在人手、医疗措施、水站以及安全保障是否足够,选手失联后有否迅速启动搜救工作等,给家属一个交代。\n昨日参与探访还有福建体育总会常务副会长廖宗剑。'], + [60202106020654947, + '平遥国际电影展10月举办', + '记者从山西省政府新闻办举行的新闻发布会上获悉,第五届平遥国际电影展将于十月十二日到十九日在平遥古城举办。\n平遥电影展有限公司董事长贾樟柯介绍,经过四年的探索和实践,平遥国际电影展已经形成了相对稳定的节目策划框架和组织模式,选片、组织形式等方面均获得电影界同仁、观众和媒体的肯定与好评,因此,第五届平遥国际电影展将保持原有方向和已有特点。“我们将继续在展映世界各国优秀影片的基础上,尤为注重发现并积极推广新兴及发展中国家青年导演的优秀作品,为这些影片提供发声的平台,增强世界各国电影工作者之间的交流,以激活、繁荣世界电影的创作。”他说。'], + [60202106020732552, + '姜涛压力大发脾气', + "二十二岁“人气王”姜涛“520我爱你”于520当日在社交网站“放负”,用英文留言:“I'm tired of everything(我对所有事情感到厌倦)”,“姜糖”担心不已,经理人花姐事后极速澄清他只是身体不适,休息几天再向大家报平安。有指姜涛压力大到爆煲,而他曾承认脾气唔好,情绪差经常发脾气,去年去台湾拍剧前,因为所有事情堆在一起,令他脾气大,要同身边人道歉。姜涛自520后“潜水”十一日,昨日傍晚五点于喺社交网站“上水”。\n姜涛上载在酒店露台看海沉思的照片,若有所思,并报平安说没事了,他写道:“假期结束?没事了,抱歉让大家担心了。”其后又在限时动态大玩鬼马Filter,更开心到舔脷,又分别播歌扮唱队友Ian的《DWBF》及Edan的《E先生连环不幸事件》,心情不俗!Edan则回覆:“未见过你咁靓仔。”但滤镜似乎未能掩盖他嘴上的伤口,几段片中都见到他右边嘴唇有结疤。"], + [60202106020625761, + '《澳门人·澳门事》浮世绘版画作品展', + '江户是日本近代史上一个繁盛的朝代,世间浮华的众生相被艺术家记录下来,形成了浮世绘画风。由澳门版画研究中心举办的“丽影流光浮世绘”版画展,在东方基金会展出了六十帧江户时代浮世绘大师作品,澳视澳门今(二日)晚九时播出的《澳门人?澳门事》,将为大家介绍来自日本的浮世绘版画作品展。\n是次展览由澳门资深版画家王祯宝统筹,由日本带来了多位江户时代版画大师的作品,包括葛饰北斋、喜多川歌磨、东洲斋写乐、歌川广重、歌川国明以及豊原国明等等。王祯宝表示,木刻水印的印刷技术,大概于公元八世纪时经遣唐使带回日本,至今逾千年历史,而其独特的画风乃至用色,以及异国情调都尤为吸引,在二十世纪初叶甚至影响众多西方画家,如梵谷和莫奈等皆争相仿效。\n澳门体育、澳广视网站及澳广视App为观众直播精彩体育赛事。黄昏六时,直播日本职业足球联赛,由FC横滨对川崎前锋;晚上十时,直播二0二一世界女子排球联赛,中国对土耳其,敬请收看。'], + [60202106020732544, + '图片新闻', + '◎ 由于传统市场传出染疫风险高,台湾高雄市实施传统市场“自主防疫分流”。六月一日起结合千名人力在各传统市场入口稽查,高雄左营最大的哈啰传统市场采买民众还是很多,相关单位也在一旁举牌发传单宣导,要市民以身份证字号尾数当依据,单数每周三、五、日择一天采买,双数每周二、四、六选一天出门一次买足。图为高雄左营埤仔头传统早市在路口就设置管制站,进入须查验登记。(香港中通社图片)'], + [60202106020732503, + '巫师追至一比三', + '【中央社华盛顿31日综合外电报道】76人头号大将艾比迪伤退,巫师靠比尔攻下全场最高27分,韦斯博克拿下大三元,八村垒也获双十的联手带领下,以122比114抢下胜利,目前在系列赛仍以1比3落后。\n美国职篮NBA东区第8种子华盛顿巫师,在季后赛首轮前3战,被头号种子费城76人连拿3胜,面临再输就要被横扫出局的命运。\n今天系列赛第4战,76人在比赛开打后,曾领先达11分,巫师一阵猛追,在韦斯博克(Russell Westbrook)两罚中1后,将落后追到剩下1分差,76人靠着美顿(Shake Milton)切入上篮得手,在首节31比28领先。\n76首节虽然领先,但却出现艾比迪(Joel Embiid)伤退的消息。艾比迪在比赛开打7分多钟后的一次进攻遭巫师的卢比斯(RobinLopez)挡下,接着摔倒在地,他面露痛苦的起身持续奋战,但首节被换下后,便未在回到场上,今天缴出8分6篮板2助攻成绩。\n第2节巫师在卢比斯单节8分带领下,打出32比30攻势,76人靠格连(Danny Green)中场前0.6秒砍进三分球,上半场以61比60领先巫师1分。\n巫师下半场打出6比0开局,以66比61将比分超前,第3节结束前1分7秒,韦斯博克一记急停跳投得手,让巫师88比78领先达两位数,整节下来只让76人拿下19分,打完3节后反以92比80领先12分。\n末节76人强势反扑,并曾将比分扳平,但巫师及时回神要回领先,两队一阵拉锯后,巫师惊险以8分之差拿下胜利。\n巫师今天获胜,目前在7战4胜制的系列赛,仍以1比3落后。(译者:李晋纬)'], + [60202106020014186, '晨操精选', '贺厩‘日就月将j昨晨由巴度快试一段,火气增强,过步有力,状态保持大勇。'], + [60202106020014082, + '浸大“社会与健康研究”副学士课程多角度认识全人健康', + '一场世纪疫症令大家意识到健康的重要。根据世界卫生组织的定义,健康不仅指0没有患病,更是身体、心理、灵性及社交上的和谐及整全’即近年兴起的厂全人健康”0概念。香港浸会大学国际学院(C!E)开办的r社会与健康研究”副学士课程,从0心理、营养、运动及康乐,以至社会学等不同角度探讨r全人健康”’助学生掌握相0关的学术基础及实践理论,帮助自己及他人达至健康人生。\n自疫情开始以来’每个人都对自身健康更为着紧。从生理上预防疾病开始,进而渐渐意识到社交距离措施为我们的心理健康带来负面影响。除了个人的生理及心理健康外,各行各业都受本地公共卫生政策以及世界各地疫情的发展影响。所以我们时刻关注政府的防疫措施及社会资源的运用,期望能够尽快回复正常生活。\n谁主宰大家健康\n’社会与健康研究j副学士课程专修的业界顾问、公共卫生医学专科医生尹慧儿教授表示’社会大环境时刻影响我们的健康。安全的食水及清新的空气等固然直接影响我们的生理健康,但其他社会环境因素如经济环境及就业情况、社会保障和资源的分配,以至帮助我们舒展身心的休闲文化康体活动的多寡等,亦左右我们的心理和情绪健康。\n每个人都是促进社会大璟境因素的一分子,我们日常的护理习惯已经可以对他人的健康构成风险:无论身处哪个工作岗位都有需要思考如何把健康因素纳入我们的职责范围。由于每项决策都牵涉甚广,负责社会规划和制订公共卫生政策的专业人士更加需要具备全面的知识、周详的计划和正确的价值观。\n跨学科教授全人健康知识\n社会需要培育有使命感和专业知识的青年领袖,推动我们走进一个更健康的未来oCOE紧贴社会需求,开办.社会与健康研究j副学士课程,教授全人健康知识,并强调跨学科培训,教导学生有关运动及康乐学、营养学、心理学、社会学、哲学,以至神经科学等方面的知识,内容全面。学生可从个人以至社会和环球层面去探讨不同的健康议题。\n吞港浸会大学国际学院学术统筹主任丁安祺博士表示,课程教授实用的基础理论。在营养学方面,学生可了解到人生在不同阶段需要吸收哪些营养以保持身体健康;在心理学方面’学生可掌握基本的健康辅导技巧,在日常生活中应用,帮助有需要的人。\n此外’课程中有以身心灵健康概念为基础的科目,从神经科学、哲学及心理学等方面探讨全人健康,教授学生生理及心理之间如何互相影响,以及不同的正向思维方法。当学生认识到相关的学术研究后,便可进行资料搜集,设计一些能帮助他人培养正向思维的活动,从而提升他人的幸福感。\n课程涵盖层面广\n坊间现有的课程一般将焦点放在个人身体健康层面,而此课程设计独特,强调全人健康。除了涵盖生理及心理等范畴外,还包括精神、社会、环境,甚至灵性等方面的探讨。跨学科训练有助培训学生将不同范畴的知识融会贯通,从多角度分析及研究,进而全面理解并推广全人健康的好处及重要性。\n升学就业出路阔\n担任课程学术顾问的浸大体育、运动及健康学系刘永松教授称现代的健康理念应该是全方位与跨学科的理论与专业。此课程结构以基础学术理论为起点,及后学生可按个人兴趣及专长,选择合适范畴作进阶研修。没有修读过任何理科相关学科之同学,亦可报读此课程。毕业生的升学及就业出路亦相当多元化,有志继续升学的学生可选择衔接心理学、社会学、健康学,以及体育学相关学士学位课程,如浸大的’体育及康乐管理j文学士课程。就业方面,毕业生可选择投身健康教育、社会0\n企业、运动及康乐管理等行0\n业,例如担任社福机构行政0\n人员、项目主任、公共事0\n务机构康乐活动统筹主0\n任、教学助理、医管局0\n行政主任及政府二级联0\n络主任等,出路甚广o0'], + [60202106020014377, + '川崎前锋上盘横扫', + '周三日职联赛,榜尾的FC横滨主场与一哥川崎前锋上演天地对决。卫冕的前锋火力惊人,也是客场胜率最高一队’今仗造访包尾的横滨’大炒可期‘让球盘开出前锋让球半、两球,推介上盘’(球赛编号:星期三22-唷线662台’602台周三晚上P8:00直播)川崎前锋上周凭补时入球’终以2比1勇挫鹿岛鹿角,避过连和两场的局面’前锋赛后在榜首?离次席的名古屋鲸鱼多达15分’开季20战仍然保持不败‘球队火力猛烈,至今攻入49球,比联赛入球第2多的横滨水手还多17球’首席射手李安度达美奥累积、2个士哥,包括对上两轮赛事皆有进帐’状态火热‘射入8球的中场三笛薰与左闸旗手怜央被LJ23国家队微召,中场谷口彰晤与守将山根视来则入选大国脚,因要应付国际赛而同告缺阵’但客军仍有贡献7球’曾外流西甲的34岁翼锋家长昭博,以及有6个士哥的老将小林悠等好手’其中后者每次后备上阵皆有表现,包括刚仗射入绝杀球’曰职E申射手本色不减当年。即使前锋阵容不整,人脚仍然比主队优胜.前锋今季7场作客5胜2和’胜率逾七成’正是曰职最高’值得看好‘\nFC横滨上轮作客‘’比2负大阪飞脚’在护级竞争对手身上全失3分’对士气打击甚大。横滨在刚仗落败前,对上4场联赛l胜2和1负’算是季内最好走势’如今气势被打断’难望爆冶。连杯赛在内’他们过去3场主场l胜2和’表演尚可,但至今仍是联赛主场成绩最差一队’地头优势薄弱。加上本身对垒前列分子例无运行,季内斗二哥名古屋鲸鱼输,,比3,亦曾被三哥树宾水手狂数s比‘’,而且属于曰职失球最多一队,周三遇上火力全开的一哥’输少当赢了。\n前锋过去6决斗树宾取得全胜纪录,包括上季作客大胜5比1’今场匕盘稳开’波胆估客队赢4比‘,完场、、'], + [60202106020014359, + '宝山鹰一网打尽', + '周曰田战,方厩赢得3w’追过蔡约翰’\n暂启揩首’正所谓鲫白’今次谷草7友赛’\n方嘉柏继续有实力马列阵’食住个势’可添\n头马进帐。\n三场四班P200米。’宝山鹰j居四班恶气十足’用回赢马拍当史卓丰,有力一网打尽..喜骏之星j自改战谷草后,表现大跃进’5战同程取得2冠’亚、季,上名率高达80%,今次用上潘顿,决心打硬仗。’各取所需j季尾仍有勇态,在港未交代之马,必有斗志’值得捧场。.正本雄心j后追马遇同场多快马,步速会有利争胜’直路取位顺利’有权后上打打入前列”\n四场四班7650米。’捉金皇j季内两捷同程’仍留在四班作赛’当然着数,今次用陈嘉熙减磅’鞍上人发挥得宜,足可与强敌一较高下。.梦幻战士j虽然排十一档难跑,胜在本身近况正勇,可在三甲占一席位c.曰就月将j越跑越进步’今次转场而临,有博你下信之感’随时可在此打开在港胜门。’真感j初出谷草]2‘,‘’米上名’窄场必定可应付’半冶可敲。\n五场四班P200米o.骏爵士j三四班升降机’居四班用陈嘉熙减磅出战’只要跑出水准,当然可以制胜’’腾龙超影j老马季尾有态,不会保留’斗志必坚,”蒲侠超得j复课无不妥之处,马雅骑必有分头,半冶可以一博、、.有运来j上场同程排十一档入Q’走势出色,今场排六档有利可图’有力补中。全胜'], + [60202106020014373, + '蒲侠超得开胜门', + '今年华?的成绩不俗’暂时骑师榜排头\n十位的占五席’下过黄俊的际遇却麻麻’\n犹记得于]8至]9马季,此子以38场头马\n完结赛季,前景美好,但想不到今季到目一一前还未开斋.五场四班.1200米。’蒲侠超得j已摆脱伤患困扰’试闸L火冒升,本身质案超班’有望打开脚弓。.魅力星光j胜后操练有增无减’佳态保持’今场配搭不变再战同程’可追多一次、..腾龙超影j难得老马仍然有火有力,兼目体力充沛’强配出击’斗志无疑’有力争入三甲。.富存英雄j]200米乃首本路程,人马合拍下,同属前列分子。甜蜜蜜'], + [60202106020625610, + '杭州飞轮潜力强', + '八场三班.1200米。.杭州飞\n轮j在港初出泥地1200米大败\n后,前仗转争谷草1200米’易\n配田泰安即露真面目’大胜亚\n军马;仍然赢j多达两个半马位,走势惊人’必属奸货色,人马上仗升班再跑同程’排九档形势欠佳’还能力追第三’表现持续进步,质新马复课状态更佳,今次’只。负114磅助力极大’转抽六档有利,马胆之选。\n.金鹰翱翔j季内同程开斋后,再出了仗得2季l殿,表现下进反退’幕后拉仓转投吕厩’今次转厩复出,实有瞳憬价值。.声势j上场四班同程险胜’今次升班虽遇强手’但本身大减14磅,形势有利,可加入战团。.齐齐友福j季内6战2冠2季,正是郑厩招牌马’当锐之辈升班再战下落下风,奈何排十一档’还是作配‘'], + [60202106020014177, + '史卓丰丁冠豪密港', + '史卓丰、丁冠豪今战有’宝山鹰j这匹合作马,两人昨晨在机坪相遇随即密倾,相信话题应是落在该驹身上。’宝山鹰j在港唯一胜仗正由史卓丰经手,今次故剑重逢,有动态支持下,当有捧场价值。'], + [60202106020014365, + '精彩辉煌回马枪', + '四场四班P650米o’帚豪大师j季内\n9战同程得l冠3亚2季,上名率逾五成’\n季初赢马一役姿态甚为轻松’前两仗分别\n输给’好运宝宝j及.超好日子j俱短马\n头位居亚,评分仍占优,潘顿锲而下舍’边线马再战强项’可施回马枪添食‘\n.梦幻战士j近4仗同程得2亚2季,其中3仗及1月同程建功俱由田泰安包办,人马合拍,上仗改配蔡明绍得第三。只负头马.劲驹j半个身位,表现亦佳,今仗回配田泰安,争胜要角。.真感j近两仗田草1嗽,c’米得l季’该役后上追势劲,今仗多跑2so米首战同程应更合发挥’三甲之材。.曰就月将j及.捉金皇j半冶可敲‘\n八场三班P200米o.精彩辉煌j今季两度上阵前仗同程得亚军,沿栏疾走颈位之微输给.善传万里j’上仗改争田草直路赛无功,不足为据,前季两胜班内同程’回气再战食胡路程,有力一网打尽’\n.超威星j今季专攻同程8战得2冠2亚,表现甚佳’去年]2月先在四班赢马,接战升班再出取得连捷,近3仗得2亚,分别输给.疾风劲草j及’上骏之星j’班利分子争胜要角。‘声势j上赛首战谷草同程一击即中’蔡约翰作风用尽一时,升班少负]4磅,此消彼长下’机会犹在。.杭州飞轮j及’颜色大师j冶脚材‘奥利华'], + [60202106020654805, + '电讯飞弹省招牌', + '方嘉柏马房刚战连中三元,目前以66W\n压倒65W的蔡厩,重返榜首,足见仓主\n争冠之洳L、’今次谷战,方厩派出了驹’\n阵容依然鼎盛’雄仔一于捧场。\n.电讯飞弹j在港12战只得1亚’始终未赢’今季初出五班田草1200米与‘友盈有款j串Q’可惜继后越跑越差’再出6战全败,幕后拉仓转投方厩训练,投入新环境之后,r电讯飞弹j由莫雷拉连试两课大闸’可谓做足备战工夫,今次转厩初出跑五班谷草l(100米’雷神押阵’有权即食省招牌。\n’快步速j堪称马房铁马之一’季内15次上阵取得2冠2亚5季’成绩甚佳’近仗于三班依然跑近’今次重返四班当然占优’转用手风正顺的何泽尧操刀,排二档出战贴栏而驰’大可拚入前列‘\n.胜贤j近3战田草]4(10米取得2冠]亚,马房省觊招牌’亡仗?马姿态之劲’实在难忘’直略逢马过马,完全视对手如无物,质素重新估计,三班超班无疑’今次转争谷草一哩’薛恩再骑人马合拍,有权乘胜追击。雄仔'], + [60202106020625612, + '?龙超影零距离一定美丽', + '今晚谷草9场赛\n事,采用A跑道作\n赛,六宝奖多逵积\n至290万元,预计\n派彩达到了00万元。巴度骑足9场,胯下过半数属争胜分子,有权连场入围,骑师王博得过。另外,潘明辉、郭能同样有实力马,亦须留意。马房方面,方、蔡及沈厩同样布下强阵而来,当中还是以方厩最劲,可望延续胜势,值得优先捧场。成功\n一场五班1000米o.劲飞圣j同程2冠4亚3季4殿,排一档形势更佳,回配赢马拍档巴度’有权一放到底。’椰子糖j上场五班直路赛跑第四’状态渐入佳境,今次转跑谷草同程’郭能躁刀有斗志,可予前驹莫大威胁。.常胜心j至今未?表现失色,不过同程始终是最佳’\n累积3亚2季1殿’配潘顿O[二档’必拚无\n疑。.电讯飞弹j转投方厩做足工夫\n始告上阵,战意已是值回票价”\n二塌四班1000米。.威妙星j\n上场同程落飞第二’未竟全\n功’今次再来用潘顿’喜排\n2一档如虎添翼,有权放返出\n;头。’附L.好友j三四班升降机,回师四班威力重现,何况还有雷神之助’必属争胜之材。.欢乐好友j上场同程第二’输在直路取位欠顺,输得不值’今次改用巴度再跑’要补中头马c‘友谊至佳j在港初出同程跑第四’实力居此够争,今排二档有助跟前跑’半冶佳选‘\n三场四班7200米o’美丽欢呼j上场转跑泥地’200米,大热侄以土下石丌L准’今蠏谷草]200米’本身续见进步,再追一次。.鼓浪顺风j四班有威力,转用田泰安,定可迫出最强战斗力’争胜分子。.喜骏之星j上周三同程力拚第三,文厩马气势正佳,今回相隔一周再出,用尽不留。’各取所需j在港2l战得3亚5季4殿,正宗易位瞓’宜佣己。\n四场四班1650米o.曰就月将j近3战亚季殿各一,已在作战水准’巴度埋门亲试两课,人马今次转争谷草一哩,相信幕险瞳旨把握,有权开胡。.真感j前仗田草1400米跑得第三,头二马为’劲力十足j、.超好日子j两匹再出再赢’赛绩毋须多说’四班极具分量,郭能押阵’全力出击。.捉金皇j今季8战2冠l亚1季2殿,方厩有冲冠决心’必拚格。.雷公凿j上场同程第三’状态、评分到位’黎海荣难得有实力马支持,定会用心跑奸‘\n五场四班1200米。r腾龙超影j老马回光大勇’季内11战同程有1冠3亚4季,表现稳定’今次落班即用赢马拍档潘顿’排四档好跑’坐二望一之选‘r魅力星光j上场同程击败’开心有利j’打开胜门,下排除开窍后威力大增,不容轻视。.骏爵士j上场排十二档被迫留到最后,最终劲追第五’今次再来排五档,早?可跟前,势可入位。.富存英雄j排十一档难跑,不过始终是新胜马,近绩可取’对手有失,可取而代之.\n六场四班.1800米。’有视力j上场谷草2000米与再出再赢的‘大有心得j’单打独斗,影相仅败输运无话可说,今次缩程跑1800米一样合脚法’巴度再骑,可收复失地、、r.?步速j三班都够争’如今重返四班当然占优,只是季内硬仗连场’担心体力欠强,热门宜配。’红衣震撼j上场田草1800米爆23。‘、7最?末段追第三,试准长力,只负115磅正是有利争胜条件、1.彩虹之光j尺寸马讲留放’今配潘顿深庆得人,全力求赢:\n七场三班1200米。.零距离j近3战同程2冠1季,上场升班再跑仍能放入第三’表现胜预期’质新马复课更进一步,今次再来’有力冲击第3W。.战熊三千j近9战同程3冠1亚3季,上场小休四个月复出,爆22。了s最?末段,走势出色’当属主要争胜马。.维港奔流j前仗同裎击败.资本家j’匕场再来都有第四,状态甚佳,今排十档料有分头,半冶博得过.’曰曰?j埋门试谷闸追第二’状态重新回起’只负¨5磅妤跑’难轻视’\n九场三班1650米or一定美丽j上场田草一哩赢来点到即止’潜力高强’转场再跑’潘顿再骑合拍’可视作蔡厩变倍器”’胜贤j上场田草1400米’转弯发力即以无敞姿态抡元.转投方厩的确> Why buildings wobble A14\n350\nThe SEG Plaza is over this many metres tall and has more than 70 floors, making it the tallest electronics shopping centre in Shenzhen'], + [60202106020360543, + '苹论:三孩政策揭示“未富先衰”败象已呈', + '中共党中央的核心一锤定音“优化”生育政策,准生子女数目从两名提升五成至三名,大手笔也。不管是“优化”也好,是“完善”也好,老百姓看来可不领情,寻且指斥新政策已失时效:当今百物腾贵,楼价尤甚;生孩子管养管教,使费高昂、教人疲累,一个已然超标,遑论三个。年轻青小资,即使不是奉行“不想跪着、不能站着”的“躺平主义”,亦宁可养宠物而不要孩子。新政策有否“催生”之效,毋须细表。然而生育政策何以非“优化”不可?\n生力军缩水 经济后劲不继\n“催生”政策紧接新近公布的人口普查结果而来,而其结果显示中国的人口结构大事不妙。人口总数虽则没有如事前所传下跌了,人口增长可持续四年下降;去年的新生人口只有1,200万,较2016年的1,786万少了三分一。生力军队伍大缩水,经济后劲不继,是可断言。那又焉能不急急“优化”生育政策以为补救?\n除了削弱经济干劲,新生补充人口少了将令人口迅速老化。从2010年至2020年这十年间,60岁以上的人口比重从13.3%剧增四成多至18.7%。影响所及,同期15岁至59岁的劳动人口比重则从70.1%给扯低一成至63.35%。经济增长后劲不继而需要供养的退休人口上升,一消一长,此将形成何等社会压力,显而易见。不妥为应对天晓得会否引发危及社会稳定的状况。在稳定压倒一切的极权国家,那是震耳欲聋的警号。人口结构何以有此剧变?\n其来有自。1979年开始雷厉风行的一孩政策,标榜“打下来!堕下来、流出来!就是不准生下来!”此史无前例、灭绝人性政策的苦果,除了炮制出一两代横蛮霸道的小皇帝,给伦理道德带来无从衡量却影响深远的祸害,且重男轻女、扭曲性别比例,进一步打击生育率,种下人口下跌的肇因。\n五年前,中共体察到长此下去中国势将“未富先衰”,由是放宽一孩政策至准生两名子女,以期刺激生育。众所周知,妇女的平均生育率起码要达致2.1个方能维持人口稳定;否则诸如夭折、危疾、不育等因素终将扯低人口总数,削减劳动人口,拖慢经济发展。故此两孩政策看似宽大却无以扭转人口下降、结构急剧老化之败象已呈。\n政权无视老百姓自由意志\n换言之,五年前的两孩政策已然堕后于形势,观乎老百姓的反应,现今的三孩政策同样于事无补。专权独裁者满以为无论是经济发展以至老百姓的生育皆在其掌控之中,而一切能随心所欲调校自如。不管是“躺平”还是削减对抗,他们安知老百姓都有自己的主意?他们何时方能从其黄粱大梦苏醒过来还老百姓以尊严与自由?\n哪怕“未富先衰”败象已呈,执权者可坚决埋首沙堆、自我麻醉。习近平在2018年12月25日至26日主持了中共中央政治局的“民主生活会议”,会议公告强调“习近平总书记在领导新时代党和国家事业发展中,在审视和把握日益错综复杂的国内外发展大势中,在带领全党全国各族人民奋进新时代的伟大实践中,战略判断高瞻远瞩,政治领导娴熟高超,人民立场鲜明坚定,历史担当强烈坚定,充份证明不愧为党中央的核心、全党的核心。”\n党中央的核心本领果是如斯高超,一孩政策、两孩政策、三孩政策一一泡汤,莫非此之为“战略判断高瞻远瞩”?埋葬极权何须暴力对抗?灭绝人性的生育政策接连崩坏则又证明,夜郎自大、无知妄想,极权统治的最大敌人从来都是一而再、再而三搬起石头砸自己脚的独裁者。\n古立\n周一至周六刊出'], + [60202106020482207, + 'Police tell firm Lai can no longer use his voting rights', + 'Next Digital seeks clarification on issue after jailed tycoon has assets frozen by authorities\nMedia\nJailed media tycoon Jimmy Lai Chee-ying can no longer exercise his voting rights at Next Digital, where he holds over 71 per cent of outstanding shares, the police force\'s national security arm has told the company.\nThe media group said yesterday that the National Security Department had sent it a letter a day earlier "confirming that [Lai] must not directly or indirectly exercise voting rights in relation to any shares in the company held by him except under the authority of a licence granted by the secretary for security".\nThe company said the ban "was not expected to have any impact on [its] operational and financial performance" as Lai was no longer a member of the board.\nNext Digital had requested clarification on the issue after Lai\'s assets, including his stake in the company he founded, were frozen by authorities last month.\nAccording to Next Digital, Lai owns about 1.88 billion shares of the media group, or 71.26 per cent of the total.\n"Holders of remaining shares in the company representing approximately 28.74 per cent of the total issued shares … may continue to exercise their respective voting rights at general meetings of the company unless otherwise required under the listing rules to abstain from voting," Next Digital said.\n"As such, the company expects its general meetings and resolutions to continue to be properly convened and voted on in accordance with the articles of association of the company and applicable rules and laws of Hong Kong."\nLai has been accused of collusion with a foreign country under the Beijing-imposed national security law, and was charged in April with two counts of conspiracy. Last week, he was sentenced to another 14 months in prison for organising an illegal rally on National Day in 2019.\nSix months of that punishment will run consecutively with a 14-month term tied to his involvement in two other illegal protests, extending his prison stay to 20 months.\nLast month, the Security Bureau froze nearly HK$500 million of Lai\'s assets, including his stake in Next Digital, which publishes the Apple Daily newspaper, as well as the local bank accounts of other firms he owns.\nThe move marked the first time authorities had invoked their new powers to freeze the assets of a listed company they believe could be related to the commission of a national security crime.\nHong Kong\'s security minister, John Lee Ka-chiu, has also warned Lai\'s bankers that dealing with his frozen accounts could land them in prison for up to seven years.\nTrading in Next Digital shares was suspended for 10 days from May 17, after Lai\'s assets were frozen.\nIn a company statement last week, the board said it did not "expect the [move] to have an immediate negative effect on the financial situation or operations of the group" and that it had enough money to run for at least 18 months from April without further funding from Lai.'], + [60202106020482267, + 'Restaurants and bars hot trend in soft office market', + 'Landlords expand entertainment options to attract younger diners while retaining tenants\nCommercial\nHong Kong property developers and landlords are welcoming more restaurants and bars to their office buildings, as they look to offer more entertainment options to their tenants in an otherwise soft office market.\nMore than five new food and beverage (F&B) outlets are set to open in and around Taikoo Place this year, for instance. These include Australian seafood restaurant Catch, boutique cafe HAVN, Lady M, which is noted for its cakes, and Japanese hand-rolled sushi and sake bar TMK.\n"Taikoo Place needs to appeal to younger members of the world … we need to make sure that it has all the amenities and restaurants and bars, and I think we have achieved that," said Don Taylor, the director of office at Swire Properties. The developer owns and operates Taikoo Place and is a major landlord in Quarry Bay.\nMost developers are putting more effort into expanding their food offerings to better serve and retain tenants amid a rise in vacant spaces in their office buildings. The vacancy rate for grade A offices rose to 7.9 per cent in Central in April, 12.4 per cent in Wan Chai and 7.5 per cent in Causeway Bay, according to Knight Frank.\n"Office tenants nowadays are, in general, more demanding, while landlords would like to enhance the image of their building and provide extra amenities to tenants," said Oliver Tong, head of retail at JLL in Hong Kong.\nSwire Properties, which recently launched Two Taikoo Place, could go a step further and improve food offerings across the whole of Taikoo Place, Taylor said. "We will put more higher-end dining in place to cater to the demand of executives of our corporate tenants, who can entertain their clients here. We are currently looking at a number of different spaces within Taikoo Place and are speaking to a number of different operators," he added.\nHysan Development, the biggest landlord in Causeway Bay, will feature alfresco dining among other options in a new grade A office tower it hopes to unveil in the district by 2026-27.\n"We see a growing demand for lifestyle spaces among our office tenants," said Ricky Lui, Hysan\'s chief operating officer.\nLast summer, landlord and developer Hongkong Land launched Basehall, a food court with nine stalls run by some of the city\'s trendiest food operators, including Honbo, Co Thanh and Young Master Brewery, in the basement of Jardine House. The company, which owns about 450,000 square metres of prime office and retail properties in Central, will offer more "innovative and trendy" dining places in the next six to 12 months, it said.\n"We are seeing F&B elements not just in the basements and lobbies, but also on rooftops, floors with balconies … We have had some discussions with various landlords on this," said Ada Fung, head of advisory and transaction services, office services, at CBRE.\n"F&B operators are always up for unique locations. So rooftops and balcony floors are always interesting options for them," she added.\n7.9%\nThe vacancy rate in Central for grade A office space in April, while in Wan Chai it was 12.4 per cent and in Causeway Bay 7.5 per cent'], + [60202106020482055, + "Ideology 'a losing battle' with U.S.", + 'Focus must instead be on technological and economic competition with West, noted reformer says\nU.S.-China tensions\nChina should play down its ideological differences with the West and instead highlight technological and economic competition, a pro-reform figure has urged.\nBeijing is facing growing pressure from Washington and its allies over trade, investment and technology.\n"Now is not the time when [China] needs to compete with the West in the field of ideology," said Li Ruogu, a former chairman of the Export-Import Bank of China.\n"Our main issue is still the development [of the economy and society], without which there will be so many problems we cannot resolve," Li added on Saturday at the second Qujiang Forum in Xian, Shaanxi province.\nTensions with the United States had led to an inevitable trend of China being cut off from the global supply chain, Li said, although in the end, overseas suppliers would still find ways to sell their products to the country\'s attractive consumer markets.\nXu Qiyuan, the director of the research department at the China Finance 40 Forum and a research fellow at the Chinese Academy of Social Science, said Beijing could learn from Washington, which ensured the security of key parts of its global supply chain by depending on allies for most imported parts and components.\n"On the one hand, we need to have trump cards in technology, having the ability to fight back; on the other hand, in politics, we should not only avoid being isolated but also need to unite more countries to stand together," Xu said.\nThe Qujiang Forum, which was organised by the China Finance 40 Forum think tank, also said in a report that after considering the deterioration of political relations and the artificial disruption of technology supplies, the risks facing China\'s global supply chain could rise sharply, while those facing the US were unlikely to change significantly.\nIt also urged caution over US President Joe Biden\'s moves to leverage allies and multilateral platforms to contain China in major technologies, even as Beijing\'s diplomatic room for manoeuvre dwindled\nAddressing the country\'s top scientists, engineers and researchers last Friday, President Xi Jinping said China must make breakthroughs in areas such as artificial intelligence, semiconductors, quantum technology, life sciences and energy.\n"Competition over cutting-edge technology has intensified to an unprecedented level. We must have a strong sense of urgency and be fully prepared," Xi said.\nHis speech coincided with a similar call from Biden, who on the same day asked the US Congress to back an ambitious funding increase for scientific agencies.\nLast week, Kurt Campbell, the Asia-Pacific policy director for the US, said the "dominant paradigm" with China would now be one of competition, as the period of engagement with China had "come to an end".\nThe odds that there would be two parallel systems for digital technology were also rising as the US-China row intensified, the Qujiang Forum report added.\nUnder current conditions, the Biden administration was unlikely to cancel trade tariffs on Chinese products in the short term, but the two countries could seek to exempt more items from the levies, the report said.\n"The [US trade representative] could use tariff exclusion measures to avoid Congressional resistance and domestic political pressure," the report said.\nYu Yongding, a senior fellow at the Chinese Academy of Social Sciences, said at the forum on Saturday that the nation had to make a judgment, on whether it would eventually be forced to decouple from the US in the hi-tech sector. Washington\'s current actions were designed to minimise the costs of such a final decoupling, he warned.\n"This is very important for both the government and enterprises … whether to throw away illusions and prepare for battle or continue to embrace hope and strive for better results," Yu said.\n"Any misjudgment will cause huge losses."'], + [60202106020482056, + "Disgraced ex-PM leads critics as nation starts 'total lockdown'", + 'Najib Razak\'s attack on government\'s handling of coronavirus pandemic strikes chord with citizens\nMalaysia\nAs the coronavirus situation deteriorates in Malaysia, brickbats are coming thick and fast for Prime Minister Muhyiddin Yassin - not just from the opposition, but from ostensibly friendly forces as well.\nFormer prime minister Najib Razak - voted out of office in 2018 and facing jail time for convictions linking him to the multibillion-dollar 1MDB financial scandal - has emerged as a complainer-in-chief of sorts against the current administration.\nA third wave of the Covid-19 pandemic that stretches back to September has snowballed at a startling pace in recent weeks, with consecutive days of record new cases - including a tally of 9,020 on Saturday. The figure dropped to 6,999 and 6,824 on Sunday and Monday respectively ahead of the two-week "total lockdown" that began yesterday.\nThe government announced the measure - which shuts most businesses and bans dining in as well as social gatherings during the lockdown period - amid signs that the health care system was close to being overwhelmed.\nOpposition leaders such as Anwar Ibrahim and Lim Guan Eng have assailed Muhyiddin for the authorities\' seemingly haphazard manner of announcing fresh restrictions.\nThe toughest jabs, however, have come from Najib, whose United Malays National Organisation (Umno) is a key cog of the ruling administration.\nIn his latest salvo on Monday, he suggested the government was taking the side of multinational companies as it allowed certain manufacturing sectors to continue operating during the 14-day lockdown. He claimed the government was allowing this on the basis that it was in the interest of keeping global economic supply chains intact.\n"Who will uphold justice for millions of ordinary citizens and small traders who will make sacrifices during this full lockdown while large companies owned by MNCs [multinational corporations] and wealthy corporations continue making profits without disruption?" he asked.\n"The Movement Control Order was already half-baked, don\'t let the total lockdown become a tiga suku [three-quarters] lockdown," he wrote on Facebook, referring to the restrictions that were in place before the government announced the tougher measures on Friday.\nPolitical analysts said such commentary had by and large struck a chord with citizens increasingly despondent over the country\'s fraught politics, amid a perception that Muhyiddin\'s administration lacks the competence to see Malaysia out of the current crisis.\nHis Perikatan Nasional alliance came to power in March last year after a complicated power struggle that displaced the then ruling Pakatan Harapan coalition.\nMuhyiddin - who set up the self-coup that handed him power - this January obtained royal assent for an eight-month state of national emergency that grants him powers to govern by fiat.\nParliament is suspended, and despite pleas from the opposition for the legislature to sit - MPs have largely been vaccinated - the government has insisted the status quo will remain until at least August.\nPolitical observer Pauline Leong said a common perception was that Najib, prime minister from 2009 until 2018, was functioning better as "opposition" compared with his time in power.\nJames Chin, a Malaysia watcher at Australia\'s University of Tasmania, said the former premier was accurately echoing resentment on the ground. However, Chin opined that the objective of the criticism was to signal that Najib, not Muhyiddin, was the more capable crisis-time leader in the government camp.\n"I think the longer Najib is out there, and the bigger the mess the government makes on [lockdown-related policies], it will make him look better and better," Chin said.\nDon\'t let the total lockdown become a [three-quarters] lockdown\nNajib Razak'], + [60202106020482046, + 'Apple keen on Chinese suppliers despite tensions', + 'Nearly one-third of the US tech giant\'s newly shortlisted vendors come from the mainland\nTechnology\nApple has added more suppliers from mainland China than anywhere else to its list of vendors over the past three years, defying worsening ties between Washington and Beijing during the Trump administration, talk of economic decoupling and increasing scrutiny of its component producers.\nNearly one-third of the newly shortlisted companies are from the mainland, according to a Post analysis of Apple\'s supplier list for 2017 and 2020.\nAmong the 52 new names added to the latest list, 15 are from the country, with several based in the tech hub of Shenzhen and others from Jiangsu province.\nSuppliers from the United States and Taiwan ranked second in terms of numbers, each with seven new shortlisted companies.\nThe increase in the number of mainland companies approved as Apple suppliers highlights the country\'s importance in global hi-tech supply chains, especially after it was able to contain the coronavirus and reopen its domestic economy.\nThe Post reported earlier that the Zhengzhou factory of Taiwan-based Foxconn Technology Group, Apple\'s biggest subcontract assembler of iPhones with a workforce of a quarter of a million, had been offering cash rewards to attract new workers to cope with busy production.\nThat marked a sharp contrast with Foxconn\'s iPhone factory in India, which had to cut output by 50 per cent because of Covid-19 infections among workers there, Reuters reported last month.\n"China\'s mature manufacturing industry still has its attractiveness despite the US-China tensions," said Will Wong, a Singapore-based analyst at research firm IDC.\n"Nevertheless, it doesn\'t mean Apple will stay away from diversifying its supply chain and production. This is especially true given that the political tensions and supply-chain disruptions caused by the pandemic have taught industry players not to put all their eggs in one basket."\nThe 200 companies on Apple\'s 2020 supplier list account for 98 per cent of the company\'s direct spend for materials, manufacturing and assembly of products worldwide. Nearly 80 per cent of these suppliers have at least one production site on the mainland.\nAlthough the list does not provide the monetary value of third-party services, mainland-based suppliers generally offer lower value-added manufacturing and materials while products and services with higher profit margins, including semiconductors and advanced components, are controlled by US and Taiwanese suppliers. Apple still relies heavily on US suppliers such as industrial conglomerate 3M, as well as chip companies Intel and Skyworks Solutions.\nMainland companies on Apple\'s list include Shenzhen Everwin Precision Technology, a 20-year-old intelligent component maker, Tianma Microelectronics, a maker of liquid-crystal displays, flash memory firm GigaDevice and Nanping Aluminium, a metal manufacturer in Fujian province.\nGiven Apple\'s dependence on the mainland as both a supply-chain partner and a key market for its products, chief executive Tim Cook has actively cultivated government and business ties during his visits to the country. For example, he is the chairman of the advisory board of the Tsinghua University School of Economics and Management, a position that grants access to Chinese leaders, including President Xi Jinping, who is a Tsinghua alumnus.\nChina\'s mature manufacturing industry still has its attractiveness despite the US-China tensions\nWill Wong, analyst'], + [60202106020373871, + '澳洲女子垒球队奥运代表团抵达日本', + '【本报记者报道】东京奥运将于下月举行,澳洲女子垒球队的奥运代表团成员抵达日本,参加奥运前的训练营,是首批到日本准备参加东京奥运的运动员。垒球队将在东京附近的大田区进行为期47天的训练营,当地距离东京大约80公里。大田区一名官员表示,代表团所有成员已经接种新型冠状病毒疫苗,他们在逗留期间,将每天接受病毒检测。\n日本神户市政府宣布,在市内确认日本境内未曾发现过新冠变异病毒株,是由英国变异病毒株变异而来,神户市政府目前认为传染力和重症化风险等特征未变。此外,共同社报道,日本内阁官房长官加藤胜信在记者会上,宣布扩大为公民接种疫苗的措施。据报在工作地点和大学提供的疫苗是美国莫德纳疫苗。\n首批抵日本东奥海外运动员\n日本政府计划由本月21日开始,在工作场所及大学校园推展接种新冠病毒疫苗,以加快当地的疫苗接种进度。日本现时每日为包括医务人员在内民众,每日接种大约50万剂次疫苗。前日公布的统计数据显示,至少已接种一剂的老年人达466万人,占对象人口的13.1%,即是每八人有一人接种。\n日本内阁官房长官加藤胜信表示,在不影响为长者接种的前提下,企业可由配合的产业医生为员工与家属接种,大学则可在校园内为学生与教职员打针,若地方政府提早完成长者接种计划,亦可提早为企业及大学人员接种。\n另外,日本政府昨日在内阁会议上,敲定加快开发与生产国产疫苗的战略,包括完善研发据点、加快批准药物使用、完善战略分配研究资金机制,和探讨中央收购疫苗,以确保疫苗供应不受其他国家情况影响,以及应对变异病毒株。首相菅义伟表示,开发生产国产疫苗和确立接种制度,对危机管理而言极为重要。\n####\n澳洲女子垒球队奥运代表团成员抵达日本。'], + [60202106020051606, + '47名泛民初选案 吴敏儿撤保释申请', + '【本报法庭组报道】47名发起或参与去年民主派“35+”初选人士,涉嫌违反《港区国安法》,被控“串谋颠覆国家政权罪”,其中11名早前被收押的被告再申请保释。国案法指定法官、总裁判官苏惠德昨处理其中八宗申请,最终七人被拒保释,被告吴敏儿则于庭上撤回保释申请,并放弃每八日覆核的权利。苏官今天将处理余下两人,谭凯邦及陈志全的保释申请。\n昨申请的被告包括有锺锦麟、吴政亨、王百羽、赵家贤、刘泽锋、范国威、余慧明及吴敏儿。苏官听完辩方陈词后,即时拒绝前七名被告的保释申请。已还押三个月的吴敏儿明显消瘦、状甚憔悴及苍老,她知悉前七名被告的由请被拒,当庭撤回申请。\n8人申保释7人被拒\n47名被告同被控一项串谋颠覆国家政权罪,指他们于2020年7月1日至2021年1月7日期间,串谋他人旨在颠覆国家政权。47人当中,现时杨雪盈、刘伟聪、吕智恒、林景楠、黄碧云、郑达鸿、彭卓棋、柯耀林、何启明、施德来、李予信共11人获准保释。\n另外,五名涉47人初选案的前立法会议员,就他们所涉的另三宗立法会冲突案再提讯,其中林卓廷、尹兆坚、郭家麒、区诺轩及胡志伟,昨于西九龙裁判法院要求撤销保释申请获准,而他们所涉的三案将押后至10月4日再讯。\n####\n被告吴敏儿撤回保释申请。(FB图片)'], + [60202106020482287, + 'WHO SAID IT', + "Who will uphold justice for ordinary citizens and small traders who will make sacrifices during this full lockdown?\nformer Malaysian prime minister Najib Razak, suggesting the government is favouring multinational companies during a 14-day lockdown\n> Asia A8\nI have to make a quality brand as opposed to other brands that just hire an ambassador who doesn't even use the products\nVirithipa Pakdeeprasong, a top Thai actress, on promoting her new skincare brand\n> Life B10\nI think now the best thing for the tournament, the other players and my well-being is that I withdraw\nTENNIS STAR NAOMI OSAKA, AFTER PULLING OUT OF THE FRENCH OPEN, SAYING SHE has been SUFFERING FROM DEPRESSION and anxiety\n> SPORT B12"], + [60202106020386423, + '5泛民控扰乱立会 4撤保释申请续还押', + '【明报专讯】因民主派“35+初选”而被控串谋颠覆国家政权罪的前立法会议员林卓廷、尹兆坚、区诺轩、胡志伟及郭家麒,涉于2018至2020年间在立法会扰乱会议秩序、妨碍立法会人员等,3案昨在西九龙裁判法院再讯。除区诺轩外,其余4人申请撤销保释获批,须继续还押,案件应辩方要求押后至10月4日再讯,以待终审法院就梁国雄抢文件上诉案作出裁决。\n5人陆续步出囚室,尹兆坚明显消瘦,胡志伟更是面容憔悴、面颊凹陷,旁听席见状随即传出“瘦咗好多……”,多人落泪,同案被告张超雄及黄碧云亦有到庭支持。\n3案发生于2018年6月13日立法会审议高铁一地两检条例草案、2019年5月11日的《逃犯条例》修订法案委员会会议、以及去年5月8日的立法会内务委员会时,5人分别被指与朱凯廸、范国威、梁耀忠、陈志全、许智峯、黄碧云、张超雄及郭永健妨碍正在执行职责的立法会人员等。\n【案件编号:ESCC2993/18&2514/29、WKCC3842/20】'], + [60202106020386437, + '永义3亿统一红磡漆咸道北旧楼业权', + '【明报专讯】继今年初统一坚尼地城吉席街旧楼业权后,永义国际(1218)昨日亦统一红磡漆咸道北472号、474号、476号及478号的业权。上述项目昨早进行强制拍卖,永义在无对手下,以底价3亿元统一有关旧楼业权。资料显示,永义早前已持有漆咸道北470号已包括全数100%业权,将连同上述漆咸道北472号、474号、476号及478号合并发展,合并后地盘面积4,685方呎;若作商住物业,最高地积比率为9倍,即可建楼面42,165方呎。\n另早前城规会就古洞南分区计划大纲草图作修订,并暂录4份申述,其中新地(0016)对古洞南锦坑路一幅综合发展用地高限及密度的设定表示支持,惟有关范围内旗下一个已获准规划许可的住宅项目,因基建工程而令地盘面积略为缩减,建议放宽项目地积比,以补偿相关楼面损失。\n####\n永义国际代表(举牌者)昨以3亿元统一红磡漆咸道北472号、474号、476号及478号的业权,若作商住发展预计可建楼面约4.2万方呎。'], + [60202106020051615, + '中港互挂ETF利科技股 恒指曾上29300', + '中港两地ETF互挂拓展至上海,昨日在上海挂牌的华泰柏瑞南方东英恒生科技ETF(513130.SH),主要追踪恒生科指的南方恒生科技ETF(03033),该基金已在5月18日完成集资,集资金额达11.7亿元人民币,超过4万名投资者认购。受消息及潜在买盘影响,周二恒指在新经济股的带动下向好,一度重上29,300水平之上。\n参考个别轮证发行商网站,在截至2021年6月1日8时,录得单日最多资金流入恒指(好),金额约6,506万元;录得第2多资金流入的是美团(淡),约3,473万元。录得单日最多资金流出是恒指(淡),金额约11,275万元,录得第2多资金流出的是美团(好),约4,639万元。数据反映投资者趁恒指大幅波动时,把淡仓获利,并再建好仓。\n看好后市者,可留意恒指牛证66766,收回水平28,800元,到期日2024年1月30日。看淡后市者,可留意恒指熊证53101,收回水平29,718元,到期日2021年11月19日。'], + [60202106020373866, + '变相带头列打疫苗为聘用条件 金融监管机构迫交数立坏先例', + '香港疫苗接种率低,为达致群体免疫指标,港府近日“威迫利诱”各界尽快接种,表明一旦爆发第五波新冠肺炎疫情,抛出未接种者禁入食肆、戏院、学校、博物馆及体育馆等辣招,公众咋舌。一波未平,一波又起,香港金融管理局更加进取,强制银行在未来两周内提交准备接种疫苖的职员人数及部门,成为首个公营机构具体指令要交“接种清单”。纵使金管局辩称不是要求交职员姓名,亦无言明收集后跟进工作,但此举跟“交人名”并无分别,肯定成为施压“捽数”指标。金管局今次做法俨如带头列打疫苗为聘用条件,立坏先例。我们认为,每人体质不同,绝对不宜强接种疫苗,除非特别工种,如从事航空或旅游业,若职场列接种疫苗为聘用条件,后果堪忧,打工仔为了生计,只得拿自己性命作赌注。\n金管局要求银行应确认并拟定一份预定接受接种的指定员工名单,名单应包括但不限于涉及分行运作、财富管理和商业银行等,需与客户经常面见的职员,以及负责关键资讯科技、数据中心、资金和结算的人员操作。至于尚未接种或基于健康理由而不适合接种疫苗的员工,定期进行新冠病毒测试,是必要的风险管理措施。\n金管局指出,自昨天发布通告两周内,银行应按部门或职能细分已接种疫苗的职员总数,若有员工在6月30日前并未接种第一剂疫苗,应进行首次新冠肺测试。此外,尚未接种或基于健康理由而不适合接种的员工,则需要每两星期进行检测。\n我们认为,金管局强制规定要提交接种疫苖的职员人数及部门,虽然辩称不是要求交人名,但其实这跟“交人头”没有分别。试想想若果银行高层看着一些部门的接种人数低,肯定一层层压下去,即部门主管肯定想尽办法做好这盘数,威迫利诱下属接种,纵使不明言,当人事管理的,总有方法令到下属知难而退或屈服。\n香港金融管理局总裁余伟文昨发表文章时表示,指银行业与其他行业不同,即使疫情极为严峻之时,银行业仍会继续提供必要服务,尽管营运时间与规模有所缩减。有见及此,金管局已去信银行要求其大力鼓励需要亲身接触客户的员工,包括分行的员工,和执行关键支援功能的员工尽快接种疫苗。余强调,疫苗接种计划是推动经济复苏策略的基石之一。他指出,若接种疫苗率继续落后于纽约、伦敦或新加坡等其他主要国际金融中心,对于香港作为国际金融中心之一的竞争力将可能受到影响。因此,接种疫苗是为大众回复正常生活,以及达到经济可持续复苏作好准备的关键。金管局欢迎政府牵头给予政府雇员疫苗假期以鼓励接种,该局亦已推出措施便利和鼓励同事接种疫苗,包括让同事有两天额外假期。余伟文说得响亮,先例一开,后患无穷,动摇了劳工可享有的基本人权。'], + [60202106020373865, + '太古地产投放800万元鼓励市民接种疫苗', + '【本报记者报道】太古地产(1972)宣布,投放800万元鼓励市民及员工接种疫苗,提高疫苗接种率。太古地产称,8月底前完成接种两剂疫苗的香港居民身分证持有人,可参加抽奖,得奖名额500个,每人可获一万元现金券,在太古广场、太古城中心或东荟城名店仓使用。现金券可于发出后一年内使用,抽奖详情及相关条款细则稍后公布。\n太古地产继较早前向完成接种两剂疫苗的员工,提供一天特别假期后,会再向于8月底前完成接种两剂疫苗的香港合资格员工,派发2,000元商场现金券,同时为所有香港员工免费提供疫苗接种前的身体检查。太古地产旗下的太古酒店日前同样为其香港员工,提供接种前的健康评估,完成接种疫苗后亦可享用一至三天疫苗假期。为进一步提升接种率,已接种两剂疫苗的酒店员工将有机会参加大抽奖,赢取高达一万元的旅游奖赏、“宅度假”套票等30份丰富奖品。\n#冀助港经济重回正轨\n太古地产行政总裁白德利表示,提高疫苗接种率是遏止疫情的最有效办法。期盼为社会出一分力,透过连串鼓励措施支持香港特区政府的疫苗接种计划,并为员工、租户以及广大市民提供支援,让香港经济重回正轨。\n至于鹰君集团宣布由即日起至8月31日,在旗下酒店及商场推行一连串消费优惠。香港朗廷酒店、香港康得思酒店及香港逸东酒店将提供合共3,000晚酒店房间,以及其住宿餐饮的半价优惠,给予已完成接种两剂疫苗并能出示相关证明的香港居民,上述人士旗下酒店餐厅及明阁湾仔分店消费,亦可享八五折优惠。鹰君集团旗下冠君产业信托的朗豪坊提供消费百分之百奖赏,予已完成接种两剂疫苗的LPClub会员。会员于商场消费满100元,即可获最多100元朗豪坊商场或其商户之现金礼券,朗豪坊及后公布计划详情。\n####\n太古地产公布8月底前完成接种两剂疫苗的市民可参加抽奖,中奖者可获一万元商场现金券。'], + [60202106020374068, + '女医生王李丽明与已婚男病人出轨 自爆厨房性交', + '【本报记者报道】医务委员会昨日展开纪律聆讯,涉及医生王李丽明涉嫌在2013至15年,与同样已婚的顾客SaulDeweiQIU发生不正当关系,涉及医学专业人士行为不端,被控专业失当。王李丽明作供时表示,与对方属“浪漫关系”(RomanticRelationship),之后发展至性关系,首次是在某寓所内的厨房发生。\n王李丽明被控专业失当罪,案情称她与顾客发生不当关系,违反《香港注册医生专业守则》。\n原告GoreClaireVenessa,与前夫SaulDeweiQIU育有一名小孩,夫妇二人曾接受王李丽明负责的身体检查。不过,被告王李丽明某一天突然告知她,与她当时的丈夫曾发生关系。\nQIU作供时表示,由于事件已发生一段时间,仅透露大约在2014年进行身体检查,称事件发生的时间,需回家翻查记录才能确定。他强调,与王李丽明为朋友关系,称二人常于周末及假期见面,并于2012年开始透过通讯软件Wechat聊天,并逐渐互相了解对方,其后更发展至多次“一夜情”的性关系。控方律师多次问到性关系的具体行为及定义,QIU表示“已多次做爱”。\n研讯小组委员于是质疑“一夜情”一词,QIU反问委员“你是个男人,对吧?”至于辩方律师问到是否避免前妻Gore观看到聊天记录,因而使用内地通讯软件Wechat。他回应称,Wechat设有英文版本。\n王李丽明说,为QIU身体检查发生于2014年,不过二人早于2011年已是朋友关系,当时更偶尔相约于周末行山。她透露,二人在Wechat聊天时使用中文,平日会互相分享歌曲,亦彼此诉说心事及表达感受,更为对方设置特别的昵称,形容为“浪漫关系”。她透露,首次关系发生于某住所内的厨房,接吻后发生性关系。医委会决定押后至6月27日继续审理。\n####\n医务委员会昨日召开纪律聆讯。'], + [60202106020386328, + '梁定邦:警用武符国际惯例但可改善', + '【明报专讯】反修例事件触发大型社会冲突,衍生大量针对警方的投诉个案。刚卸任监警会主席的梁定邦接受电视台访问称,监警会去年发表专题审视报告,提出了52项建议,警方已落实一半,监警会将继续跟进执行情况。梁定邦说,监警会参考过国际惯例,认为警方使用武力实际上颇符合国际惯例,也符合法例,但认为可以改善。\n梁定邦接受明珠台节目《清心直说》访问,担任了监警会主席一职3年,因健康理由而未能做多一年。在他的任期内发生了反修例事件,他形容过去几年“确实是风风雨雨”。被问会否觉得监警会成为被争议的对象,他回应称任何公职都免不了受不同意见批评,“绝对是置于交火之中”。他重申监警会是依法行事,“为了解决问题,我们必须基于法律,我们必须基于证据、基于事实,否则我们可依靠什么?”\n称法庭已就案件裁决关注独立调查作用何在\n反修例示威者的五大诉求包括成立独立调查委员会,梁定邦表示,监警会撰写专题审视报告,并非为了让政府回避成立独立调查委员会。他称不知道政府现在还有无意欲成立独立调查委员会,若真的要成立,政府要厘定好职权范围。\n他关注若法庭已就一些案件作出裁决,独立调查委员会的作用何在;另外,若政府希望透过独立调查找出事件的推手,则牵涉政治问题,未必是委员会可处理。'], + [60202106020344618, + '英格兰料轻取奥地利', + '【大公报讯】据每日邮报报道:英格兰领队修夫基在周二决选欧洲国家杯最终26人大军名单,初选的33人中,已确定有曼联年轻锋将格连活特因伤退出,消息亦指利物浦右闸阿历山大阿诺特与今季外借韦斯咸表现优异的曼联中场连格,将双双落选。\n阿诺特或无缘欧国杯\n修夫基会在当地时间周二傍晚,把大军由33人缩减至官方上限的26人,其中19岁曼联锋将格连活特索性留在曼联养伤而自行退出,今季大部分时间表现欠佳的阿诺特,亦在右闸位置人脚过剩下势被剔走,较令人意外是连格亦可能无缘出征。此外,中场禾特普斯、后卫宾韦特、宾葛菲特及门将兰斯达尔,亦有大可能成为被筛走的另外4人。\n英格兰将于明晨在米杜士堡主场友赛另一入围欧国杯球队奥地利。修夫基应不会派遣刚踢完欧洲赛决赛的车路士、曼城和曼联等球员披甲,但凭哈利卡尼、查顿辛祖等亦足以攻破客军,而且奥地利有曾在英超比赛的射手阿拿奥杜域因伤缺阵,翼锋华伦天奴拿沙路受限于隔离条例未有赴英,料英军可轻取对手。(有线662及602台明晨3时直播)'], + [60202106020396136, + '大坂直美抑郁势弃东奥 退出法网无限期休战 细威咖喱仔各界送暖', + '世界“二姐”大坂直美自揭受抑郁症困扰3年,昨宣布退出法国网球公开赛,更会无限期休战。虽然她杯葛记者会的做法引起争议,但“细威”莎莲娜威廉丝、NBA球星史堤芬居里等开腔力撑。仅23岁的大坂近3年四夺大满贯,又因黑人血统为平权发声,但个性内敛的她终被压力推倒,下月能否在全球注视下主场出战东京奥运仍是问号。\n法网女单次号种子大坂直美早前以保护精神健康为由,拒绝参与赛后记者会,首圈后被罚1.5万美元(约11.6万港元),再面临四大满贯赛会的禁赛警告。她昨发声明宣布退赛,希望让焦点回归网球,又称“需远离球场一阵子”。她首次透露2018年首夺美国公开赛后,长期受抑郁困扰,“曾经历艰难的时间”,又因社交焦虑,比赛时常戴耳机。她续解释,为保护自己才决定缺席法网记招,向受伤害的记者致歉,但重申认为规则过时。\n细威感同身受居里:尊重杯葛决定\n“细威”法网首圈过关后,表示对大坂的处境感同身受,称“想给她一个拥抱”。“大威”维纳丝威廉丝、嘉奥芙等球手,以及退役“飞人”保特等留言支持,效力金州勇士的NBA球星“咖喱仔”史堤芬居里称,虽然大坂不应采取杯葛记者会的做法,但予以绝对尊重:“当权者未能提供保护,才令人踏上此途。”曾提醒受访是工作一部分的拿度,昨晚出战首圈前未再回应。\n奥运奖牌希望日官员称健康最重要\n大坂2018年于美网爆冷击败细威,收获首座大满贯奖杯,细威在该仗与主裁判争执,大坂捧杯时难敌全场嘘声洒泪,要由偶像细威安慰。大坂至今赢得4项大满贯,创亚洲女子网坛历史,场外亦屡成焦点,去年在《福布斯》收入榜力压“细威”居首,并打破舒拉宝娃的年度收入纪录。拥黑人血统的她在种族平权浪潮中高调发声,然而昨表示自觉并非天生演说家,每次面对传媒均感焦虑。其胞姊麻里早前撰文,称泥地赛成绩欠佳令妹妹饱受压力,上月罗马大师赛首圈出局更令她自信彻底崩溃。\n大坂直美原定下月主场争夺东奥奖牌,日本奥运代表团团长福井烈表示,对大坂休战感惊讶,但未知具体情况,盼对方早日重拾笑容。日本网球总会称大坂的健康是首要考量,内阁官房长官加藤胜信则表示,将“静静留意她的情况”。\n####\n大坂直美(圆图)发声明(上图)宣布退出法网,并自揭受抑郁症困扰3年。'], + [60202106020373873, + '中学校长会调查:逾7成家长及专业人士高度信任中学教师', + '【本报记者报道】前年反修例风波,屡有建制议员质疑教师专业,但香港中学校长会委托香港政策研究所进行的调查结果显示,逾7成家长及专业人士高度重视及信任中学教师;另有超过7成受访家长对教师在学生发展范畴,例如生涯计划等方面的表现抱有信心。中学校长会建议教育局,向教育人员专业操守议会赋予法定权力,例如制订教师专业标准、执行教师注册制度、处理纪律问题等,也可成立教师公会,提升教育工作者的专业地位及社会认同。\n中学校长会主席连镇邦说,社会须思考如何吸引新人入行延续优质教育,从调查可见家长亦期望成立教育专业机构,各持份者努力让香港下一代接受良好教育,让教育专业回归专业。\n中学校长会荣誉顾问、英华女校荣休校长李石玉如表示,不论社会及政治气氛如何转变,校长会仍坚定不移在乎教育专业,包括组织教育研讨会、跨界别进行交流。李石玉如勉励同工说:“每个时间有每个时间嘅挑战,努力啦!有时有困难都系好,时穷节乃现,反思下为咩教育,当反思及经历艰难仍留低,教育心更强。”\n教师心不死才可有良好服务\n被问及华英中学重建工程拨款疑因政治因素被抽起,是否反映政治凌驾教育专业?李石玉如表示,不宜评论单一事件或学校情况。至于学校仍有否空间讨论六四?她深信教师心不死,才可有良好服务,如何教育、是否教育也是专业判断,希望各界信任教育界,留有沟通空间。研究团队在2020年以问卷访问107位校长、443名现职老师、99名准教师及2,030名家长;另焦点访谈约59人,包括校长、老师、家长及专业人士。'], + [60202106020048897, + '《F9狂野时速(2D版)》 IIB', + '导演:林诣彬\n主演:云迪素、查理丝花朗、海伦美兰、米雪露芝姬丝\n片长:2小时23分\n故事简介阿当一家过着隐居生活,但好景不长,他“失散”多年的车神杀手亲生细佬雅各自动搵上门,随即引爆连场国际级大灾难。为击破雅各守护所爱,阿当马上召集原班队友回阵,走遍英、美、日,杀出阿塞拜疆、格鲁吉亚;旧友复现,前债再临,“一家人”冲击真正试炼,阿当誓要改写历史!手车再快,能否冲破过往枷锁?'], + [60202106020344330, + '遥远的相似性', + '有一次,霍金被问到什么最让他感动。他的回答是:遥远的相似性。当我看到这几个字时,忽然也有些感动。这位伟大的物理学家说的“遥远”当然以浩瀚宇宙为范围,而“相似性”指的估计也是天体,及其自然运行之状。不过,这句话的意义又不止于物理。\n同类相怜,大概是人性的内涵之一。当我们身处异地,却发现民俗风情类似于故乡时,常会心生欢喜。我至今记得,十多年前在陕北发现一种果乾,与老家的做法十分相似,那时物流尚不如今日发达,竟能在几千里外的异乡尝到家乡的味道,着实令我兴奋了一阵。\n类似的情况很多,并不限于吃食。比如,以前常从图书馆借书看,借来的书上有时写着批注,这些“注家”做好事不留名,破坏公共图书中整洁,自不值得提倡,却也偶有妙语,恰恰击中我读时的感受,如同心里有一面锣,被敲得哐当一响。现在,下载安装一款读书软件,便可以随时线上阅读,标识的段落、写下的感受,不复有破坏书本之虞,也能让更多的人读到观书心得。\n网络化的生活下,人人线上,于是,我们很容易就能和遥远的陌生人搭上话了,在浩瀚的人海中碰到相似的思想、观念和趣味的几率也增加了不少。博客刚兴起那会儿,我也开过一个,闲下来就在上面涂涂抹抹;有的时候,也四处闲逛,发现有共鸣的话题,就忍不住回应上一段。有时被博主发现了,顺藤摸瓜地找过来,一来二去,谈得更加深入,大有相见恨晚之感,实则从不曾谋面。这是一种精神上的相似性,虽遥远,也互给了对方不少鼓励。因为只是精神上的相投,从未有奔现的愿望。彷佛古人外出游历,在寺墙上见了旁人题下的诗句,有所感触,便也留上几句,神交足矣,无需面遇。\n遥远的相似令人期待,大概也因符合距离产生美的定律。如应和者就在身旁,如说相声的捧哏之于逗哏,滋味就会大不一样了吧。\n逢周一、三、五见报'], + [60202106020373879, + '华为前雇员王伟晶被控当中国间谍 波兰受审', + '【本报记者报道】中国电讯设备供应商华为的前波兰销售主管王伟晶,以及一名波兰网络安全专家,被控为中国做间谍,在首都华沙法院受审。两人均否认控罪。控罪称,王伟晶涉嫌为中国从事间谍活动超过七年,试图加强华为对波兰政府的影响力,并使中国能够管理波兰的科技基础设施。\n检方表示,王伟晶(39岁)还被指控招募了一名前波兰特工,这名特工向他透露了影响波兰救援和公共安全服务无线电网络的方法。王伟晶自被捕以来一直被扣押。这名波兰被告PiotrD.曾在政府高层工作多年,他被指控成为了公共行政方面的消息来源。\n检察官说,这名根据隐私规则要求不透露姓氏的网络安全专家告知了王伟晶一个监控系统,这个监控体系是为了防止入侵者接触到通过华沙军事大学创建的光纤通信网络发送的机密信息。\n两人均否认有任何不当行为。王伟晶的律师扬科夫斯基(BartlomiejJankowski)表示,检方没有证据证明他的当事人有任何间谍活动。他表示,“没有证据表明有任何违法行为”。华为曾强调事件,属个人行为,已将王伟晶解雇。\n####\n王伟晶是华为波兰分公司前高层。\n图为北京市联想桥附近的一间电器商店内,华为与荣耀手机销售专区。'], + [60202106020048925, + '百应控股订购后回租交易 涉2,360 万人币', + '百应控股(08525)公布,于5月31日收市后,间接全资附属作为买方,根据融资租赁协议与黔东南交通订立一项售后回租交易。\n根据融资租赁协议,买方将按2,360万元人民币的代价向黔东南交通购买租回资产,并向黔东南交通出租租回资产,为期12个月,并收取租赁款项作为回报。'], + [60202106020482062, + 'OFFICIAL LIST', + 'Change in board lot size and parallel trading\nImperium Group Global Holdings: The board lots for trading in the ordinary shares will be changed from 1,000 shares to 500 shares, accordingly, the parallel trading in the ordinary shares begins today under: Imperium Gp-500 (776) and Imperium Gp-1K (2925).\nNew listing\nTa Yang Group Holdings rights (2927): Trading in the nil paid rights begind today.\nCS-HSBC@EP2109B (13715), MS-CCB @EC2110A (13723), CS-Tenct@EC2109F (13724), UB-FEIHE@EC2111A (13730), UB-HSTEC@EP2112A (13732), UB-Xiami@EC2112B (13736), UB-CKH @EC2110A (13737), MB-CICC@EC2202A (13738), MB-Alihi@EC2112A (13740), MB-FOPHA@EC2203A (13742), CS-BYD @EP2111B (13757), CS-MYCG@EC2202A (13768), CS-Tenct@EC2110E (13769), CS-HSBC@EP2110A (13770), CS-BYDEI@EC2112B (13771), CS-PSBC@EC2112A (13774), CS-CKA @EC2112B (13778), CS-JDHI@EC2112A (13786), BP-Aliba@EC2204B (13793), BP-LENOV@EC2112A (13795), BP-BYD @EC2110D (13797), MS-PSBC@EC2112A (13799), MS-ICBC@EC2203A (13803), SG-BYDEI@EC2203A (13807), SG-CMB @EC2111A (13810), SG-HUAHO@EC2112A (13812), SG-SMIC@EC2112C (13813), SG-Xiami@EC2111B (13817), BP-GEELY@EC2112A (13818), BP-KUASO@EC2112C (13822), BP-HSBC@EC2208A (13827), BP-HSBC@EC2202A (13835), BP-Xiami@EC2112B (13842), BP-Xiami@EC2202B (13843), BP-SMIC@EC2112A (13844), BP-SMIC@EC2201B (13847), JP-HSBC@EP2112A (13848), JP-HSBC@EP2110A (13850), JP-COVS@EC2201A (13851), JP-CMB @EC2112A (13852), BI-JIANC@EC2110A (13855), BI-LENOV@EC2109A (13860), HT-MNIU@EC2110A (13864), HT-JDCOM@EC2110A (13865), GS-Xiami@EC2109B (13866), GS-Xiami@EC2109C (13873), GS-KUASO@EC2110D (13877), GS-Tenct@EC2111A (13878), GS-BYDEI@EC2112A (13881), GS-SHKP@EC2109A (13882) and GS-Aliba@EC2201A (13884): Dealing in the derivative warrants begins today.\nBP#HSI RP2109G (67369), BP#HSI RC2408Q (67370), HS#HSI RP2201F (67371), HS#MtuanRP2111U (67372), HS#MtuanRC2111L (67373), HS#XiamiRC2112Y (67376), HS#HSI RC2310X (67380), HS#HSI RC2310F (67382), CS#HSI RP2109D (67384), CS#HSI RP2110F (67385), CS#MtuanRC2110C (67386), CS#AlibaRC2110Q (67387), CS#HSBC RC2112G (67392), CS#HSBC RC2112H (67393), CS#HSBC RP2112C (67396), CS#GEELYRC2202D (67398), CS#AlihiRC2112E (67399), CS#TenctRC2109P (67401) and CS#TenctRP2112E (67402): Dealing in the callable bull/bear contracts starts today.\nParallel trading\nSmart City Development Holdings: The counter for trading in the consolidated shares stock code 8600 as represented by old share certificates will be withdrawn after the close of business today and trading in the shares will only be under the stock code 8268.\nSuspension of trading\nChampion Technology Holdings (92) and Kantone Holdings (1059): Trading in their shares were suspended yesterday pending of an announcement pursuant to Chapter 14 of the Rules Governing the Listing of Securities on The Stock Exchange of Hong Kong Limited and the Hong Kong Code on Takeovers and Mergers, which contains certain inside information.\nCTR Holdings (1416): Trading in the shares was suspended yesterday pending of further notice.\nThe Sincere (244): Trading in the shares was suspended yesterday pending the publication of the annual results of the Company and its subsidiaries for the year ended 28 February 2021.'], + [60202106020354139, + '世卫用希腊字母称呼变种病毒 避免污名化 不与发现地点相联系', + '【大公报讯】综合法新社、《卫报》、路透社报道:全球新冠疫情发生一年多以来,多国相继出现变种新冠病毒毒株。为避免最早发现变种的国家被污名化,世界卫生组织(WHO)5月31日宣布,将以希腊字母来称呼全球各地新冠变种毒株。世卫新冠疫情技术负责人范克尔霍夫表示,“任何国家都不应该因为检测和通报新冠病毒变异体而被污名化。”另外,英国近期疫情出现反弹迹象,新增病例大部分涉及在印度发现的变种病毒B.1.617.2,恐令6月21日最终阶段的解封计划延迟。\n自新冠疫情发生以来,中国无疑成为新冠病毒“污名化”的最大受害者。全球多国发现新冠变种病毒后,媒体报道中也经常将变种病毒与发现的地方挂钩。为了避免污名化,近几个月以来,世卫一直在讨论变种重新命名工作。\n世卫5月31日决定以希腊字母来命名新冠变种病毒,根据被发现的时间,按照24个希腊字母的顺序排序。例如,希腊字母“Alpha”将代表全球最早、在英国首先发现的变种B.1.1.7,“Beta”代表首现于南非的B.1.351,“Gamma”则代表首现于巴西变种病毒的P.1,而首现于印度的B.1.617.2变种病毒被称作“Delta”。当希腊字母表的24个字母用完时,世卫将引用新方式来命名其他变种。\n曾考虑用希腊诸神命名\n世卫表示,变种毒株的科学名称难记且容易误报,加上民众经常使用首先发现地来称呼变种毒株,带来污名化和歧视性的不良影响。为避免上述情况并简化公共交流,当局才鼓励各国政府、媒体和其他机构采用新的名称。\n世卫新冠疫情技术负责人范克尔霍夫在推特上强调,“任何国家都不应该因为检测和通报新冠病毒变异体而被污名化。”她说,新命名方法为了帮助公众讨论,并不会取代现有的科学名称。她还呼吁各国对变种病毒进行“强有力的监测”,并分享科学数据以帮助阻止其传播。\n世卫实验室高级研究员科宁斯表示,最初的计划是自行创造双音节的“混成词”,但后来发现名字大多已经被公司、地点甚至是民众使用。此外,世卫亦曾考虑过用希腊神话里诸神的名字,以及用阿拉伯数字来命名变种病毒。但针对后一种方案,世卫认为阿拉伯数字可能会与基因测序的数据混淆而最终放弃。\n英解封恐因变种病毒推迟\n另外,世卫总干事谭德塞致年度大会闭幕辞表示,“缺乏数据、信息、病原体、技术和资源的分享”是疫情大流行本质特征,而“(大流行)协议将促进共享、信任和问责,并为建立其他全球卫生安全机制奠定坚实的基础。”他介绍称,目前已有60多个国家表态支持,成员国还将在11月29日举行会议进一步探讨此事。\n因应在印度发现的变种病毒B.1.617.2,英国近期疫情出现反弹迹象,5月31日新增3383宗确诊病例,过去一周内共录得23418宗确诊病例,较此前一周增加28.8%。英国政府新兴呼吸道病毒威胁谘询小组(Nervtag)成员古普塔教授表示:“(英国的)新增确诊病例数量呈指数型增长,其中至少四分之三是新变种个案。”\n他指出,英国现在已经处于第三波疫情的开端,但由于疫苗接种率高,所以可能会比前几波疫情需要更长时间爆发。截至6月1日,英国已有59%的人口至少打完一剂疫苗。\n根据政府计划,英国将在6月21日进入最后解封阶段,取消所有社交距离限制。首相约翰逊将在6月14日决定是否推进这一最后阶段。古普塔表示,现在英国的接种率距离能够控制疫情传播的70%水平已经不算太远,应该将解封计划推迟“几个星期”。\n####\n5月27日,东京街头佩戴口罩的民众。\n疫情期间美国纽约民众佩戴口罩外出。'], + [60202106020386454, + '恒地员工打齐两针 可获4天假期', + '恒基地产(0012)联席主席李家诚在昨日公司举行的股东会后,被问到有什么措施鼓励市民接种疫苗,他指集团既不停鼓励员工接种新冠肺炎疫苗,亦让打齐两针的同事可享4天假期,甚至积极构想多元化的方法,鼓励市民接种疫苗。不过,他较早前因为患上感冒,因此未能打针,但亦打算安排接种。另一联席主席李家杰表示,已于3月份接种两针疫苗。\n预告续增持美丽华\n新冠疫情肆虐,李家诚称,集团旗下酒店的出租率受疫情影响较多,一般出租率只有30%至40%,到周末时才有70%,平日只有20%至30%。不过,随着较早前增持美丽华(0071)至超过五成后,他指出,集团长远会逐步增持美丽华,并且看好美丽华前景及多元化业务,不会急切收购或展开其他行动。李家杰补充,集团未有计划增持系内其他公司如煤气(0003)及小轮(0050)。随着恒地持股33%的恒艾健康旗下内地健康产业园落实发展,李家杰表示,这可以提供空间,供集团尝试新发展。\n对于政府推出中环海滨商业扡招标,李家诚指有兴趣参与。他亦对香港楼市看法颇为正面,并谓集团今年售楼理想,盼完成销售目标。李家杰又称,恒地多年来尽快及不停将手上农地转为熟地,尽管政府出手收回部分新界土地,他认为,若用来做基建设施及公屋,他们基于公众利益亦会赞成及同意,集团未有因此遇上损失。至于恒地何时恢复送红股,副主席林高演表示将视乎业绩而定。']] + diff --git a/gomate/modules/clusters/generata_report.py b/gomate/modules/clusters/generata_report.py new file mode 100644 index 0000000..2030f68 --- /dev/null +++ b/gomate/modules/clusters/generata_report.py @@ -0,0 +1,101 @@ +import json +import os + +import pandas as pd +import requests +from tqdm import tqdm + + +class LLMCompressApi(): + def __init__(self): + self.prompt_template = """ + 分析以下新闻标题列表,提取它们的共同主题。生成一个简洁、准确且不超过10个字的主题标题。 + 新闻标题: + {titles} + 主题标题: + """ + # self.api_url = '' + # 根据自己api地址修改 + self.api_url = 'http://10.208.63.29:8888' + + def compress(self, titles): + titles = "\n".join(titles) + prompt = self.prompt_template.format(titles=titles) + # ====根据自己的api接口传入和输出修改==== + + data = { + "prompt": prompt, + } + # loguru.logger.info(data) + post_json = json.dumps(data) + response = requests.post(self.api_url, data=post_json, timeout=600) # v100-2 + response = response.json() + # =====根据自己的api接口传入和输出修改=== + + return response + + +class LLMReportApi(): + def __init__(self): + self.prompt_template = """ + 请根据以下提供的新闻素材,编写一份主题报告,内容贴切主题内容,不少于50字。 + + 新闻素材: + {contexts} + + 主题报告: + """ + # self.api_url = '' + # 根据自己api地址修改 + self.api_url = 'http://10.208.63.29:8888' + + def compress(self, titles, contents): + contexts = '' + for title, content in zip(titles, contents): + contexts += f'标题:{title},"新闻内容:{content}\n' + + prompt = self.prompt_template.format(contexts=contexts)[:4096] + # ====根据自己的api接口传入和输出修改==== + + data = { + "prompt": prompt, + } + # loguru.logger.info(data) + post_json = json.dumps(data) + response = requests.post(self.api_url, data=post_json, timeout=600) # v100-2 + response = response.json() + # =====根据自己的api接口传入和输出修改=== + return response + + +dfs = [] + +for file in os.listdir('level2'): + if file.endswith('.xlsx'): + df = pd.read_excel(f'level2/{file}') + dfs.append(df) + +df = pd.concat(dfs, axis=0).reset_index(drop=True) +print(df.columns) +llm_api = LLMCompressApi() +llm_report = LLMReportApi() +with open('result/cluster_level1_index.jsonl', 'w', encoding="utf-8") as f: + for index, group in tqdm(df.groupby(by=["cluster_level1_index"])): + titles = group['title'][:30].tolist() + response1 = llm_api.compress(titles) + titles = group['title'][:5].tolist() + contents = group['title'][:5].tolist() + response2 = llm_report.compress(titles, contents) + + f.write(json.dumps({"cluster_level1_index": index, "level1_title": response1["response"].strip(), + "level1_content": response2["response"].strip()}, ensure_ascii=False) + "\n") + +with open('result/cluster_level2_index.jsonl', 'w', encoding="utf-8") as f: + for index, group in tqdm(df.groupby(by=["cluster_level2_index"])): + titles = group['title'][:30].tolist() + response1 = llm_api.compress(titles) + titles = group['title'][:5].tolist() + contents = group['title'][:5].tolist() + response2 = llm_report.compress(titles, contents) + f.write(json.dumps({"cluster_level2_index": index, "level2_title": response1["response"].strip(), + "level2_content": response2["response"].strip()}, ensure_ascii=False) + "\n") diff --git a/gomate/modules/clusters/get_es_data.py b/gomate/modules/clusters/get_es_data.py new file mode 100644 index 0000000..64fe204 --- /dev/null +++ b/gomate/modules/clusters/get_es_data.py @@ -0,0 +1,32 @@ +import requests +import json +import pandas as pd +url='http://10.208.61.117:9200/document_share_data_30_news/_search?q=%E7%89%B9%E6%9C%97%E6%99%AE&size=3000&sort=publish_time:desc' + +response=requests.get(url) + + +with open('data.json','w',encoding='utf-8') as f: + json.dump(response.json(),f,ensure_ascii=False,indent=4) + + +with open('data.json','r',encoding='utf-8') as f: + data=json.load(f) + + +sources=[hit['_source'] for hit in data['hits']['hits']] +print(len(sources)) + + + +source_df=pd.DataFrame(sources) +print(source_df) +print(source_df.columns) +print(source_df['paragraph_ids']) + +source_df['id']=source_df.index +source_df['id']='source_'+source_df['id'].astype(str) + +print(source_df) + +source_df[['id','title','content']].to_excel('data.xlsx') \ No newline at end of file diff --git a/gomate/modules/clusters/libraries/display.py b/gomate/modules/clusters/libraries/display.py new file mode 100644 index 0000000..c2cdb59 --- /dev/null +++ b/gomate/modules/clusters/libraries/display.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 11:38 +# description:"通用打印函数" +from libraries.timer import get_now_time + + +def usual_print(msg=None, prompt=None): + if prompt: + print("{}:{}------>:{}".format(get_now_time(), prompt, msg)) + else: + print("{}:------>:{}".format(get_now_time(), msg)) + +def save_print(save_dir): + print("{}:正在保存到路径------->:{}".format(get_now_time(), save_dir)) + +if __name__ == '__main__': + usual_print('2021/05/25.csv',"保存文件") + usual_print('2021/05/25.csv', ) diff --git a/gomate/modules/clusters/libraries/labels.py b/gomate/modules/clusters/libraries/labels.py new file mode 100644 index 0000000..3dcaaf5 --- /dev/null +++ b/gomate/modules/clusters/libraries/labels.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 15:15 +# description:"do something" +import os + +import pandas as pd +from tqdm import tqdm +from libraries.utils import find_lcsubstr + + +def load_corpus(): + corpus_dir = './data/month/csv/' + tmp = [] + for csv in os.listdir(corpus_dir): + csv_df = pd.read_csv(corpus_dir + '/' + csv, error_bad_lines=False) + tmp.append(csv_df) + df = pd.concat(tmp, axis=0) + + df['label'] = df['label'].apply(lambda x: "".join(x.split('、')[1:])) + past_labels = [] + for label in list(set(df['label'])): + past_labels.append(label) + return df, past_labels + + +def load_noduplicated_labels(data): + # print("构建历史相似label...") + # data, _ = load_corpus() + + # 将 港报其他要闻 替换成 港报要闻 + data['label'] = data['label'].apply(lambda x: x.replace('港报要闻', '港报其他要闻') if '港报要闻' in x else x) + data['label'] = data['label'].apply(lambda x: x.split('_')[0] + '_中央及特区政府' if '特区政府' in x else x) + # 合并相似label + tmp = pd.DataFrame(data.label.value_counts()) + # print(tmp) + tmp['value'] = tmp['label'] + tmp['label'] = tmp.index.values.tolist() + tmp['label1'] = tmp['label'].apply(lambda x: x.split('_')[0]) + tmp['label2'] = tmp['label'].apply(lambda x: x.split('_')[1]) + tmp = tmp.reset_index(drop=True) + # print(tmp) + # ('非常任法官施觉民辞', 9) + new_tmp = tmp.copy() + match_res = [] + for index, row in tmp.iterrows(): + tmp_res = set() + tmp_res.add(row.label) + for new_index, new_row in new_tmp.iterrows(): + try: + # if row.label == '新冠肺炎_疫情' and new_row.label == '新冠肺炎_香港疫情': + # print(row.label, new_row.label, row.label1 == new_row.label1) + if row.label1 == new_row.label1: + if row.label2[0] == new_row.label2[0]: # 判断label的开头是否相同 + res = find_lcsubstr(row.label2, new_row.label2) + if res[1] >= 3: + tmp_res.add(new_row.label) + elif len(row.label2) <= 2 or len(new_row.label2) <= 2: + # 新冠肺炎_香港疫情 新冠肺炎_疫情|修例风波_检控 修例风波_私人检控|港报要闻_港大 港报要闻_香港大学 + res = find_lcsubstr(row.label2, new_row.label2) + if res[0] == new_row.label2 or res[0] == row.label2: + # print(row.label, new_row.label) + tmp_res.add(new_row.label) + except Exception as e: + print(row) + match_res.append(sorted(tmp_res)) + tmp['match_label'] = match_res + existed_label = tmp['match_label'].values.tolist() + + def is_contained(label): + for ex in existed_label: + if set(ex) & set(label): + if ex != label and len(ex) > len(label): + return ex + return label + + tmp['final_label'] = tmp['match_label'].apply(lambda x: is_contained(x)) + tmp['final_label'] = tmp['final_label'].apply(lambda x: x[-1]) + # print(tmp[['final_label','match_label']]) + # print(tmp['final_label'].nunique()) + tmp.to_csv('data/outputs/labels/labels_dropdup.csv', index=False) + label_dic = dict(zip(tmp['label'], tmp['final_label'])) + return label_dic + + +def drop_duplicated_labels(data): + print("========================") + print("去除label之前文章label个数:", data['label'].nunique()) + label_dict = load_noduplicated_labels(data) + data['label'] = data['label'].apply(lambda x: label_dict[x] if x.strip() in label_dict else x) + # data['final_label'] = data['label'].map(label_dict) # 会导致有些label为空 + # print(data.shape[0] - data.count()) + print("去除label之后文章label个数:", data['label'].nunique()) + print("========================") + # print(data.shape[0] - data.count()) + return data diff --git a/gomate/modules/clusters/libraries/preprocessing.py b/gomate/modules/clusters/libraries/preprocessing.py new file mode 100644 index 0000000..07f242e --- /dev/null +++ b/gomate/modules/clusters/libraries/preprocessing.py @@ -0,0 +1,177 @@ +import re +import os +import jieba +from ltp import LTP +import logging +jieba.setLogLevel(logging.INFO) + +def REstrip(text): + # # 去掉首尾的param + # text0 = '' + # i = 0 + # remove_nota = u'[’·!"#$%&\'()*+,-./:;<=>?@,。?★、…【】()《》?“”‘’![\\]^_`{|}~]+' + # while text0 != text or i == 0: + # i += 1 + # text0 = text + # text = text.strip().strip(string.punctuation) + # text = text.strip(remove_nota) + # mo = re.compile(r'^([' + str(param) + r']*)(.*?)([' + str(param) + ']*)$') + # result = mo.search(text) + # text = result.group(2) + # return text + if ' ' in text: + if len(text) > 10: + text = text.split(' ')[0] + else: + text = text + if ':' in text: + cands = text.split(':') + if len(cands) > 1: + if len(cands[-1]) > 1: + text = cands[-1] + else: + text = text + else: + text = cands[0] + if ':' in text: + cands = text.split(':') + if len(cands) > 1: + text = cands[-1] + else: + text = cands[0] + if ' ' in text: + text = text.split(' ')[0] + + # title 预处理 + punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}' + # text = re.sub(r"[%s]+" % punc, " ", text) + text = re.sub(u"\\(.*?)|\\{.*?}|\\[.*?]|\\【.*?】", "", text) + return text + + +def load_stopwords(): + base_path = os.getcwd().replace('\\', '/') + # with open(base_path + '/data/stopwords/hit_stopwords_custom.txt', 'r', encoding='utf-8') as f: + with open(base_path + '/data/stopwords/baidu_stopwords.txt', 'r', encoding='utf-8') as f: + words = f.read().split('\n') + return words + + +# stop_words = load_stopwords() + + +def remove_blank_lines(input_str): + """ + 去除空行,去除多余空字符,去除\n \t + :param input_str: 输入文本 + :return: + """ + # 去除换行符以及多余空白字符 + input_str = ''.join(input_str.split()).strip() + return input_str + + +# remove_blank_lines(text) + + +def remove_news_stops(input_str): + """ + # 去除新闻报道中常用的文字 + # https: // blog.csdn.net / lzz781699880 / article / details / 105405793 + str = '氯化锂(3项)' + name = re.sub(\(.*?项\),'',str) + print(name) 氯化锂 + :param input_str: + :return: + """ + pattern1 = re.compile(r'【.*?讯】|(记者.*?)') + input_str = re.sub(pattern1, '', input_str) + # pattern2 = re.compile(r'^.*讯|^记者.*报道') + pattern2 = re.compile(r'^.{0,10}讯|^记者.*报道') + input_str = re.sub(pattern2, '', input_str) + input_str = re.sub('栏名:', '', input_str) + input_str = re.sub('作者:', '', input_str) + return input_str + + +# remove_news_stops(text) + + +def remove_stopwords(input_str): + words = [w for w in jieba.cut(input_str)] + stop_words = load_stopwords() + words = [w for w in words if w not in stop_words] + return " ".join(words) + + +def remove_punctuation(input_str): + """ + 去除标点符号 + https://zhuanlan.zhihu.com/p/53277723 + :param input_str: + :return: + """ + pattern = "[.!/_,$&%^*()<>+""'?@|:~{}#]+|[——!\\\,。=?、:“”‘’¥……()《》【】「」『』]" + pattern = re.compile(pattern) + sentence = re.sub(pattern, '', input_str) + sentence = sentence.strip() + return sentence + + +def text_tokenizer(input_str, remove_stop=True, remove_punc=True): + """ + 实现中文文本分词 + :param input_str: 输入文本 + :param remove_stop: 是否去除停用词 + :param remove_punc: 是否去除标点符号 + :return: + """ + # input_str = remove_blank_lines(input_str) + input_str = remove_news_stops(input_str) + if remove_stop: + input_str = remove_stopwords(input_str) + if remove_punc: + input_str = remove_punctuation(input_str) + input_str = " ".join([w for w in input_str.split() if w]) + # input_str = " ".join(input_str.split()) + return input_str + + +# text_tokenizer('A burden for Biden') + + +article = """【东方日报专讯】【本报讯】中国海警上周日(23日)拘捕十二名人蛇,当中包括早前涉嫌违反《港区国安法》而被捕的「香港故事」成员李宇轩,以及多名涉反修例案件的人士。昨有报道引述消息称,偷渡案的幕后黑手怀疑是一名台北牧师,并称该牧师与本港违法占领行动(占中)其中一名发起人朱耀明相熟。朱发表声明,否认有参与该十二名人士的偷渡安排,又称并不认识该牧师。 +称不认识涉案台北牧师朱耀明昨透过声明表示,有关报道涉及其本人的内容完全失实,又批评该报道暗示他参与安排偷渡计划,做法居心叵测。朱重申并不认识该牧师,亦从未与他见面或以其他方式联系,而其本人无参与该十二名人士的偷渡安排。 +涉及计划偷渡到台湾而被捕的十二人中,「香港故事」成员李宇轩于本月十日被捕,当日警方国家安全处亦以涉违《港区国安法》为由,拘捕包括壹传媒黎智英等多人,并于翌日通缉朱耀明之子、「香港民主委员会」总监朱牧民。 +此外,朱牧民亦早于七月三十一日,联同逃亡英国的前香港众志常委罗冠聪等另外五人,因涉违《港区国安法》而被通缉。""" + + +def gen_first_para(input_str, text_length=None): + """ + 输入一篇文章内容 获取第一段 + :param text_length: 目标文本长度 + :param input_str: 输入文章内容 + :return: + """ + ltp = LTP() + + input_str = text_tokenizer(input_str) + sents = ltp.sent_split([input_str]) + final_str = '' + if text_length: + for sent in sents: + if len(final_str) < text_length: + final_str += sent + else: + break + else: + final_str = ''.join(sents) + + # sents = input_str.split(' ') + # final_str = remove_punctuation(final_str) + print(final_str) + print("==============" * 20 + "\n") + return final_str + +# gen_first_para(article) +# remove_punctuation(article) diff --git a/gomate/modules/clusters/libraries/timer.py b/gomate/modules/clusters/libraries/timer.py new file mode 100644 index 0000000..5e66c96 --- /dev/null +++ b/gomate/modules/clusters/libraries/timer.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 11:05 +# description:"时间相关函数" +import time +import datetime +import pandas as pd +from dateutil.relativedelta import relativedelta + + +def get_dt2ts(time_str): + """ + 将日期转为时间戳 单位ms + :param time_str: 2020-12-22 00:00:00 or 2020-12-22 + :return:1608566400000 + """ + if len(time_str) == 19: + datetime_obj = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + elif len(time_str) == 10: + datetime_obj = datetime.datetime.strptime(time_str, "%Y-%m-%d") + else: + datetime_obj = None + print("日期格式不正确") + obj_stamp = int(time.mktime(datetime_obj.timetuple()) * 1000.0 + datetime_obj.microsecond / 1000.0) + return obj_stamp + + +def convert_ts2dt(timestamp=1619193600000, is_ms=False): + """ + 将时间戳转为日期 + :param timestamp: + :return: + """ + # timestamp = 1606320000000 + time_local = time.localtime(timestamp / 1000) + # 转换成新的时间格式(2016-05-05 20:28:54) + if is_ms: + dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) + else: + dt = time.strftime("%Y-%m-%d", time_local) + return dt + + +def get_next_day(begin_date): + """ + 获取输入日期的下一天日期 + :param begin_date:2021-05-24 + :return:2021-05-25 + """ + if len(begin_date) == 19: + dt = datetime.datetime.strptime(begin_date, "%Y-%m-%d %H:%M:%S") + elif len(begin_date) == 10: + dt = datetime.datetime.strptime(begin_date, "%Y-%m-%d") + else: + dt = None + print("日期格式不正确") + dt = dt + datetime.timedelta(1) + end_date = dt.strftime("%Y-%m-%d") + return end_date + + +def get_standardtime_by_offset( + date='2021-04-24', + type=1, + year=0, + month=0, + day=0, + hour=0, + minute=0, + second=0, +): + ''' + 根据现在时间和设定偏移量获取标准时间 + :param type:偏移类型,1为加法,其他为减法 + :param year:年 + :param month:月 + :param day:日 + :param hour:小时 + :param minute:分钟 + :param second:秒 + :return:如1970-01-01 00:00:00 + ''' + if len(date) == 19: + dt = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S") + elif len(date) == 10: + dt = datetime.datetime.strptime(date, "%Y-%m-%d") + else: + dt = None + print("日期格式不正确") + if type == 1: + return (dt + relativedelta( + years=year, + months=month, + days=day, + hours=hour, + minutes=minute, + seconds=second + )).strftime("%Y-%m-%d %H:%M:%S") + else: + return (dt - relativedelta( + years=year, + months=month, + days=day, + hours=hour, + minutes=minute, + seconds=second + )).strftime( + "%Y-%m-%d %H:%M:%S") + + +def get_dates_range(start_date='2020-12-10', end_data='2020-12-20'): + """ + 获取开始日期和结束日期所有的日期列表,包括边界日期 + :param start_date:2020-12-10 + :param end_data:2020-12-20 + :return:['2020-12-10',....'2020-12-19','2020-12-20] + """ + return pd.date_range(start=start_date, end=end_data).astype(str).values.tolist() + + +def get_today(): + """ + 返回今天日期 2021-05-24 + :return:2021-05-24 + """ + today = time.strftime('%Y-%m-%d', time.localtime()) + return today + + +def get_now_time(): + """ + 获取当前时间 + :return: + """ + return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + + +def get_window_days(end_date='2020-12-21', window_size=10): + """ + 获取距离日期end_date的前window_size的历史日期 + :param end_date:2020-12-21 + :param window_size:10 + :return:['2020-12-11', '2020-12-12', '2020-12-13', '2020-12-14', '2020-12-15', + '2020-12-16', '2020-12-17', '2020-12-18', '2020-12-19', '2020-12-20'] + """ + return pd.date_range(end=end_date, periods=window_size + 1).astype(str).values.tolist()[:-1] + + +def get_today_lastdays(period=10): + """ + + :param period: + :return: + """ + today = time.strftime('%Y-%m-%d', time.localtime()) + res = get_window_days(today, window_size=period) + return res + + +if __name__ == '__main__': + print("日期转时间戳", get_dt2ts('2020-12-22 00:00:00')) + print("时间戳转日期", convert_ts2dt(1619193600000)) + print("获取输入日期的下一天日期", get_next_day('2021-05-24')) + print("获取指定偏移量的未来日期", get_standardtime_by_offset(date='2021-05-24', day=1)) + print("获取时间列表", get_dates_range('2020-12-10', '2020-12-20')) + print("获取今天日期", get_today()) + print("获取当前时间", get_now_time()) + print("获取指定日期的历史窗口日期列表", get_window_days()) + print("获取今天的历史窗口日期列表", get_today_lastdays()) diff --git a/gomate/modules/clusters/libraries/utils.py b/gomate/modules/clusters/libraries/utils.py new file mode 100644 index 0000000..6410e63 --- /dev/null +++ b/gomate/modules/clusters/libraries/utils.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 12:59 +# description:"do something" +import gc +import logging +import os +import random + +import numpy as np +from joblib import Parallel, delayed +from libraries.timer import * +from tqdm import tqdm + +tqdm.pandas() + + + +def check_mkdir(path): + if not os.path.exists(path): + os.mkdir(path) + else: + print("路径{}已存在".format(path)) + +def logger_config(log_path, logging_name): + ''' + 配置log + :param log_path: 输出log路径 + :param logging_name: 记录中name,可随意 + :return: + ''' + ''' + logger是日志对象,handler是流处理器,console是控制台输出(没有console也可以,将不会在控制台输出,会在日志文件中输出) + ''' + # 获取logger对象,取名 + logger = logging.getLogger(logging_name) + # 输出DEBUG及以上级别的信息,针对所有输出的第一层过滤 + logger.setLevel(level=logging.INFO) + # 获取文件日志句柄并设置日志级别,第二层过滤 + handler = logging.FileHandler(log_path, encoding='UTF-8') + handler.setLevel(logging.INFO) + # 生成并设置文件日志格式 + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + # console相当于控制台输出,handler文件输出。获取流句柄并设置日志级别,第二层过滤 + console = logging.StreamHandler() + console.setLevel(logging.INFO) + # 为logger对象添加句柄 + logger.addHandler(handler) + logger.addHandler(console) + return logger + + + +def apply_parallel(dfGrouped, func): + with Parallel(n_jobs=8, backend='multiprocessing', verbose=10) as parallel: + retLst = parallel(delayed(func)(group) for name, group in dfGrouped) + result = pd.concat(retLst) + del retLst + gc.collect() + return result + + +def get_batch_data(data=None, batch_size=None, shuffle=False): + """ + 产生批数据 + :param data: 输入数据 列表 + :param batch_size: 批数据大小 + :param shuffle: 是否打乱数据 + :return: + """ + rows = len(data) # 数据条数 + indices = list(range(rows)) + # 是否打乱 + if shuffle: + random.seed(2020) + random.shuffle(indices) + while True: + batch_indices = np.asarray(indices[0:batch_size]) + indices = indices[batch_size:] + indices[:batch_size] + + print(indices) + + data = np.asarray(data) + temp_data = data[batch_indices] + yield temp_data.tolist() + + +def gen_batch_data(data=None, batch_size=32): + """ + 生成batch list数据 + :param data: + :param batch_size: + :return: + """ + l = len(data) + for ndx in range(0, l, batch_size): + yield data[ndx:min(ndx + batch_size, l)] + + +def find_lcsubstr(s1, s2): + m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)] # 生成0矩阵,为方便后续计算,比字符串长度多了一列 + mmax = 0 # 最长匹配的长度 + p = 0 # 最长匹配对应在s1中的最后一位 + for i in range(len(s1)): + for j in range(len(s2)): + if s1[i] == s2[j]: + m[i + 1][j + 1] = m[i][j] + 1 + if m[i + 1][j + 1] > mmax: + mmax = m[i + 1][j + 1] + p = i + 1 + return s1[p - mmax:p], mmax # 返回最长子串及其长度 + + +# print(find_lcsubstr('香港疫情','疫情')) + + +def reduce_mem_usage(df, verbose=True): + numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + + start_mem = df.memory_usage().sum() / 1024 ** 2 + + for col in df.columns: + + col_type = df[col].dtypes + + if col_type in numerics: + + c_min = df[col].min() + + c_max = df[col].max() + + if str(col_type)[:3] == 'int': + + if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: + + df[col] = df[col].astype(np.int8) + + elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: + + df[col] = df[col].astype(np.int16) + + elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: + + df[col] = df[col].astype(np.int32) + + elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: + + df[col] = df[col].astype(np.int64) + + else: + + if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: + + df[col] = df[col].astype(np.float16) + + elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: + + df[col] = df[col].astype(np.float32) + + else: + + df[col] = df[col].astype(np.float64) + + end_mem = df.memory_usage().sum() / 1024 ** 2 + + if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * ( + start_mem - end_mem) / start_mem)) + + return df diff --git a/gomate/modules/clusters/representations/bert.py b/gomate/modules/clusters/representations/bert.py new file mode 100644 index 0000000..9b7fef2 --- /dev/null +++ b/gomate/modules/clusters/representations/bert.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 13:47 +# description:"do something" + diff --git a/gomate/modules/clusters/representations/tfidf.py b/gomate/modules/clusters/representations/tfidf.py new file mode 100644 index 0000000..85a5853 --- /dev/null +++ b/gomate/modules/clusters/representations/tfidf.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# datetime:2021/5/11 16:26 +# description:"tfidf模型聚类与分类" +import logging +import os +import time + +import jieba +import pandas as pd +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import Normalizer + +jieba.setLogLevel(logging.INFO) + + +class TfidfVector(object): + def __init__(self): + pass + + def online_transform(self, + corpus=None, + max_features=88888, + n_components=1024, + is_svd=True, ngrams=(1, 1) + ): + """ + 实时训练模型并进行返回转换向量 + :param corpus: 分完词的用于训练tfidf模型的语料 + :param max_features: + :param ngrams: + :param is_svd: 是否进行降维 + :param n_components: + :param input_docs: 需要转化的语料 + :return: + """ + vec = TfidfVectorizer(max_features=max_features, ngram_range=ngrams, + min_df=2, max_df=0.96, + strip_accents='unicode', + norm='l2', + token_pattern=r"(?u)\b\w+\b") + + X = vec.fit_transform(corpus) # sparse matrix + if not is_svd: + X = X.toarray() + print("原始的tfidf矩阵大小X.shape", X.shape) + print("词汇量:", len(vec.vocabulary_)) # 词汇量: 76381 + print(vec.idf_) + # print(vec.vocabulary_) + if is_svd: + t0 = time.time() + svd = TruncatedSVD(n_components=n_components, random_state=42) + normalizer = Normalizer(copy=False) + lsa = make_pipeline(svd, normalizer, verbose=True) + X = lsa.fit_transform(X) + # print(X) + print("通过SVD降维之后的X.shape", X.shape) + print("done in %fs" % (time.time() - t0)) + explained_variance = svd.explained_variance_ratio_.sum() + print("Explained variance of the SVD step: {}%".format( + int(explained_variance * 100))) + + return X + + +if __name__ == '__main__': + date = '2021-05-10' + file_name = '/home/edit/newspaper/news_recommend/result/{}/result01_{}.xlsx'.format(date, date) + data = pd.read_excel(file_name) + + tv = TfidfVector() + + corpus_vectors = tv.online_transform(data['text'], is_svd=False, ngrams=(1, 1)) diff --git a/gomate/modules/clusters/singlepass.py b/gomate/modules/clusters/singlepass.py new file mode 100644 index 0000000..caee4ca --- /dev/null +++ b/gomate/modules/clusters/singlepass.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 14:32 +# description:"do something" +import collections +import os +from collections import Counter + +import jieba +import jieba.posseg +import numpy as np +import pandas as pd +from tqdm import tqdm + +from gomate.modules.clusters.corpus import documents +from gomate.modules.clusters.libraries.display import usual_print +from gomate.modules.clusters.representations.tfidf import TfidfVector + + +class SGCluster(object): + def __init__(self, + vector_path=None, + result_txt_file=None, + output_file=None, + threshold=0.3, + max_features=88888, + n_components=1024, ngrams=2): + self.vector_file = vector_path + self.result_path = result_txt_file + self.output_file = output_file + self.threshold = threshold + self.max_features = max_features + self.n_components = n_components + self.ngrams = ngrams + + def tokenize(self, text): + """ + 分词 + :param text: + :return: + """ + return " ".join([w for w in jieba.cut(text)]) + + def cosine(self, rep1, rep2): + """ + 修正后的余弦相似度 + :param rep1: numpy.array([*,*,*,...]) + :param rep2: numpy.array([*,*,*,...]) + :return: float + """ + assert rep1.shape == rep2.shape + cos = np.sum(np.multiply(rep1, rep2)) / (np.linalg.norm(rep1) * np.linalg.norm(rep2)) + return cos + + def get_max_similarity(self, dic_topic=None, vector=None, is_avg=False): + """ + 获取文档相似度最大的聚类中心 + :param dic_topic: + :param vector: + :param is_avg: + :return: + """ + max_index = -1 + max_value = 0 + for k, cluster in dic_topic.items(): + if is_avg: + # 利用当前簇所有文档与候选文档计算相似度,之后得到平均相似度 + one_similarity = np.mean([self.cosine(vector, v) for v in cluster]) + else: + # 利用当前簇的平均向量计算与候选文档的相似度 + mean_v = np.mean(cluster, axis=0) + one_similarity = self.cosine(mean_v, vector) + if one_similarity > max_value: + max_value = one_similarity + max_index = k + return max_index, max_value + + def get_keywords(slef, cluster_text): + """ + 获取聚类关键词 + :param cluster_text: + :return: + """ + sentence_seged = jieba.posseg.cut(cluster_text.strip()) + pos = ['Ag', 'an', 'i', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'v', 'eng'] + keywords = [x.word for x in sentence_seged if x.flag in pos and len(x.word) > 1] + keywords = [word for word in keywords if not word.endswith('报')] + word_cnt = Counter(keywords).most_common(3) + keywords = [w[0] for w in word_cnt] + keywords = ",".join(keywords) + return keywords + + def generate_clusters(self, corpus_vectors, text2index, theta): + clusters = {} + cluster_text = {} + num_topic = 0 + + for vector, text in tqdm(zip(corpus_vectors, text2index), + total=len(corpus_vectors), + desc="single-pass clustering..."): + if num_topic == 0: # 选取第一个文档 + clusters.setdefault(num_topic, []).append(vector) + cluster_text.setdefault(num_topic, []).append(text) + num_topic += 1 + else: + max_index, max_value = self.get_max_similarity(clusters, vector, False) + if max_value > theta: + clusters[max_index].append(vector) + cluster_text[max_index].append(text) + else: # 创建一个新簇 + clusters.setdefault(num_topic, []).append(vector) + cluster_text.setdefault(num_topic, []).append(text) + num_topic += 1 + return clusters, cluster_text, + + def classify(self, data): + if 'text' not in data: + data['text'] = (data['title'].astype(str) + ' ' + data['content'].astype(str)).apply( + lambda x: self.tokenize(x)) + if not os.path.exists(self.vector_file): + tv = TfidfVector() + corpus_vectors = tv.online_transform(corpus=data['text'], max_features=self.max_features, + n_components=self.n_components, is_svd=True, ngrams=(1, self.ngrams)) + print(corpus_vectors) + print(corpus_vectors.shape) + np.save(self.vector_file, corpus_vectors) + else: + corpus_vectors = np.load(self.vector_file) + + index2ids = collections.OrderedDict() + index2corpus = collections.OrderedDict() + for index, line in data.iterrows(): + index2ids[index] = line['id'] + index2corpus[index] = line['title'] + print(len(set(index2ids.values()))) + text2index = list(index2corpus.keys()) + print('docs total size:{}'.format(len(text2index))) + + clusters, cluster_text = self.generate_clusters(corpus_vectors, + text2index, + theta=self.threshold) + print("." * 30) + print("得到的类数量有: {} 个 ...".format(len(clusters))) + print("." * 30) + + # 按聚类语句数量对聚类结果进行降序排列 + clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True) + csv_data = [] + artilceid_clusterid = {} + clusterid_keywords = {} + with open(self.result_path, 'w', encoding='utf-8') as file_write: + for k in clusterTopic_list: + cluster_text = [] + for index, value in enumerate(k[1], start=1): + cluster_text.append( + '(' + str(index) + '): ' + str(index2corpus[value]) + '\t' + str(index2ids[value])) + csv_data.append([k[0], len(k[1]), str(index2corpus[value]), str(index2ids[value])]) + artilceid_clusterid[str(index2ids[value])] = k[0] + cluster_text = '\n'.join(cluster_text) + keywords = self.get_keywords(cluster_text) + clusterid_keywords[k[0]] = keywords + file_write.write( + "【关键词】:{}\n【簇索引】:{} \n【簇中文档数】:{} \n【簇中文档】 :\n{} \n".format(keywords, k[0], len(k[1]), + cluster_text)) + file_write.write('\n') + file_write.flush() + print("len(artilceid_clusterid)", len(artilceid_clusterid)) + print("len(clusterid_keywords)", len(clusterid_keywords)) + print(data['id']) + # print(artilceid_clusterid) + # print(clusterid_keywords) + data['cluster_index'] = data['id'].map(artilceid_clusterid) + data['cluster_label'] = data['cluster_index'].map(clusterid_keywords) + + data['cluster_count'] = data.groupby(by='cluster_index')['id'].transform('count') + usual_print(self.output_file, "正在保存到") + # print(data) + save_cols = ['id', 'title', 'content', 'cluster_index', 'cluster_label', 'cluster_count'] + if self.output_file.endswith('xlsx'): + data[save_cols].to_excel(self.output_file, index=None) + elif self.output_file.endswith('csv'): + data[save_cols].to_csv(self.output_file, index=None) + elif self.output_file.endswith('json'): + with open(self.output_file, 'w', encoding='utf-8') as file: + data[save_cols].to_json(file, orient="records", + lines=True, + force_ascii=False) + else: + data[save_cols].to_excel(self.output_file, index=None) + + +if __name__ == '__main__': + # parser = argparse.ArgumentParser(description='请输入聚类程序参数') + # parser.add_argument('--vector_path', type=str, required=False,default="vector.npy", help='文档中间向量文件') + # parser.add_argument('--result_txt_file', type=str, required=False,default="result.txt", help='聚类结果记录文件') + # parser.add_argument('--output_file', type=str, required=False, default="result.xlsx",help='聚类结果输出文件 csv/xlsx/json/etc') + # parser.add_argument('--threshold', type=float, default=0.20, + # help='文档之间合并的相似度阈值,该值越大产生的聚类数量越多') + # parser.add_argument('--max_features', type=float, default=88888, + # help='Tfidf模型词汇表大小,可以根于输入的文档规模调整') + # parser.add_argument('--n_components', type=float, default=1024, help='文档向量维度') + # parser.add_argument('--ngrams', type=float, default=2, help='ngram大小,避免维度稀疏,建议1-3') + # args = parser.parse_args() + + # file_name = 'result01_2021-06-02_rules.xlsx' + # data = pd.read_excel('data-zh-main.xlsx', dtype={'id': str}) + # data = pd.DataFrame(documents, columns=['id', 'title', 'content']) + data = pd.read_excel('data.xlsx', dtype={'id': str}) + data=data.drop_duplicates(subset=['title']).reset_index(drop=True) + print(data.shape) + data['id'] = data['id'].astype(str) + # [id,title,content] + sc = SGCluster( + vector_path="vector.npy", + result_txt_file="result.txt", + output_file="result.xlsx", + threshold=0.4, + max_features=8888, + n_components=1024, + ngrams=2, + ) + sc.classify(data) diff --git a/gomate/modules/clusters/singlepass_v2.py b/gomate/modules/clusters/singlepass_v2.py new file mode 100644 index 0000000..fc85cc0 --- /dev/null +++ b/gomate/modules/clusters/singlepass_v2.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# author:quincy qiang +# email:yanqiangmiffy@gmail.com +# datetime:2021/5/25 14:32 +# description:"do something" +import collections +import os +from collections import Counter + +import jieba +import jieba.posseg +import numpy as np +import pandas as pd +from tqdm import tqdm + +from gomate.modules.clusters.libraries.display import usual_print +from gomate.modules.clusters.representations.tfidf import TfidfVector + + +class SGCluster(object): + def __init__(self, + vector_path=None, + result_txt_file=None, + output_file=None, + threshold=0.3, + max_features=88888, + n_components=1024, ngrams=2): + self.vector_file = vector_path + self.result_path = result_txt_file + self.output_file = output_file + self.threshold = threshold + self.max_features = max_features + self.n_components = n_components + self.ngrams = ngrams + + def tokenize(self, text): + """ + 分词 + :param text: + :return: + """ + return " ".join([w for w in jieba.cut(text)]) + + def cosine(self, rep1, rep2): + """ + 修正后的余弦相似度 + :param rep1: numpy.array([*,*,*,...]) + :param rep2: numpy.array([*,*,*,...]) + :return: float + """ + assert rep1.shape == rep2.shape + cos = np.sum(np.multiply(rep1, rep2)) / (np.linalg.norm(rep1) * np.linalg.norm(rep2)) + return cos + + def get_max_similarity(self, dic_topic=None, vector=None, is_avg=False): + """ + 获取文档相似度最大的聚类中心 + :param dic_topic: + :param vector: + :param is_avg: + :return: + """ + max_index = -1 + max_value = 0 + for k, cluster in dic_topic.items(): + if is_avg: + # 利用当前簇所有文档与候选文档计算相似度,之后得到平均相似度 + one_similarity = np.mean([self.cosine(vector, v) for v in cluster]) + else: + # 利用当前簇的平均向量计算与候选文档的相似度 + mean_v = np.mean(cluster, axis=0) + one_similarity = self.cosine(mean_v, vector) + if one_similarity > max_value: + max_value = one_similarity + max_index = k + return max_index, max_value + + def get_keywords(slef, cluster_text): + """ + 获取聚类关键词 + :param cluster_text: + :return: + """ + sentence_seged = jieba.posseg.cut(cluster_text.strip()) + pos = ['Ag', 'an', 'i', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'v', 'eng'] + keywords = [x.word for x in sentence_seged if x.flag in pos and len(x.word) > 1] + keywords = [word for word in keywords if not word.endswith('报')] + word_cnt = Counter(keywords).most_common(3) + keywords = [w[0] for w in word_cnt] + keywords = ",".join(keywords) + return keywords + + def generate_clusters(self, corpus_vectors, text2index, theta): + clusters = {} + cluster_text = {} + num_topic = 0 + + for vector, text in tqdm(zip(corpus_vectors, text2index), + total=len(corpus_vectors), + desc="single-pass clustering..."): + if num_topic == 0: # 选取第一个文档 + clusters.setdefault(num_topic, []).append(vector) + cluster_text.setdefault(num_topic, []).append(text) + num_topic += 1 + else: + max_index, max_value = self.get_max_similarity(clusters, vector, False) + if max_value > theta: + clusters[max_index].append(vector) + cluster_text[max_index].append(text) + else: # 创建一个新簇 + clusters.setdefault(num_topic, []).append(vector) + cluster_text.setdefault(num_topic, []).append(text) + num_topic += 1 + return clusters, cluster_text, + + def classify(self, data): + if 'text' not in data: + data['text'] = (data['title'].astype(str) + ' ' + data['content'].astype(str)).apply( + lambda x: self.tokenize(x)) + if not os.path.exists(self.vector_file): + tv = TfidfVector() + corpus_vectors = tv.online_transform(corpus=data['text'], max_features=self.max_features, + n_components=self.n_components, is_svd=True, ngrams=(1, self.ngrams)) + print(corpus_vectors) + print(corpus_vectors.shape) + np.save(self.vector_file, corpus_vectors) + else: + corpus_vectors = np.load(self.vector_file) + + index2ids = collections.OrderedDict() + index2corpus = collections.OrderedDict() + for index, line in data.iterrows(): + index2ids[index] = line['id'] + index2corpus[index] = line['title'] + print(len(set(index2ids.values()))) + text2index = list(index2corpus.keys()) + print('docs total size:{}'.format(len(text2index))) + + clusters, cluster_text = self.generate_clusters(corpus_vectors, + text2index, + theta=self.threshold) + print("." * 30) + print("得到的类数量有: {} 个 ...".format(len(clusters))) + print("." * 30) + + # 按聚类语句数量对聚类结果进行降序排列 + clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True) + csv_data = [] + artilceid_clusterid = {} + clusterid_keywords = {} + with open(self.result_path, 'w', encoding='utf-8') as file_write: + for k in clusterTopic_list: + cluster_text = [] + for index, value in enumerate(k[1], start=1): + cluster_text.append( + '(' + str(index) + '): ' + str(index2corpus[value]) + '\t' + str(index2ids[value])) + csv_data.append([k[0], len(k[1]), str(index2corpus[value]), str(index2ids[value])]) + artilceid_clusterid[str(index2ids[value])] = k[0] + cluster_text = '\n'.join(cluster_text) + keywords = self.get_keywords(cluster_text) + clusterid_keywords[k[0]] = keywords + file_write.write( + "【关键词】:{}\n【簇索引】:{} \n【簇中文档数】:{} \n【簇中文档】 :\n{} \n".format(keywords, k[0], len(k[1]), + cluster_text)) + file_write.write('\n') + file_write.flush() + print("len(artilceid_clusterid)", len(artilceid_clusterid)) + print("len(clusterid_keywords)", len(clusterid_keywords)) + print(data['id']) + # print(artilceid_clusterid) + # print(clusterid_keywords) + data['cluster_level1_index'] = data['cluster_index'] + data['cluster_level2_index'] = data['id'].map(artilceid_clusterid) + data['cluster_label'] = data['cluster_index'].map(clusterid_keywords) + + data['cluster_count'] = data.groupby(by='cluster_index')['id'].transform('count') + usual_print(self.output_file, "正在保存到") + # print(data) + save_cols = ['id', 'title', 'content', 'cluster_level1_index','cluster_level2_index', 'cluster_label', 'cluster_count'] + if self.output_file.endswith('xlsx'): + data[save_cols].to_excel(self.output_file, index=None) + elif self.output_file.endswith('csv'): + data[save_cols].to_csv(self.output_file, index=None) + elif self.output_file.endswith('json'): + with open(self.output_file, 'w', encoding='utf-8') as file: + data[save_cols].to_json(file, orient="records", + lines=True, + force_ascii=False) + else: + data[save_cols].to_excel(self.output_file, index=None) + + +if __name__ == '__main__': + + data = pd.read_excel('result.xlsx', dtype={'id': str}) + data = data.drop_duplicates(subset=['title']).reset_index(drop=True) + print(data.shape) + data['id'] = data['id'].astype(str) + + for cluster_index, group in data.groupby(by="cluster_index"): + if len(group) > 4: + group = group.reset_index(drop=True) + # # [id,title,content] + sc = SGCluster( + vector_path=f"level2/vector_{cluster_index}.npy", + result_txt_file=f"level2/result_{cluster_index}.txt", + output_file=f"level2/result_{cluster_index}.xlsx", + threshold=0.5, + max_features=8888, + n_components=64, + ngrams=2, + ) + sc.classify(group)