update dataset.

shibing624 · Mar 1, 2022 · 5b36832 · 5b36832
1 parent 8a09213
commit 5b36832
Show file tree

Hide file tree

Showing 5 changed files with 232 additions and 15,443 deletions.
diff --git a/autocomplete/create_dataset.py b/autocomplete/create_dataset.py
@@ -4,6 +4,7 @@
 @description: Parse all files and write to a single file
 refer: https://github.com/labmlai/python_autocomplete/blob/master/python_autocomplete/create_dataset.py
 """
+import os
 import glob
 import re
 import string
@@ -13,50 +14,46 @@
 from pathlib import Path
 from typing import List, Set
 from typing import Optional
-import numpy as np
 from loguru import logger
 import ssl
 
 ssl._create_default_https_context = ssl._create_unverified_context
 PRINTABLE = set(string.printable)
 
 
-def get_python_files(dir_path: str = "download/source"):
-    """
-    Get list of python files and their paths inside `data/source` folder
-    """
-    files = glob.glob(f"{dir_path}/**/*.py", recursive=True)
-
-    return files
-
-
 def read_file(path: str) -> str:
     """
     Read a file
     """
     with open(path, 'r', encoding='utf8') as f:
-        content = f.read()
+        try:
+            content = f.read()
+        except UnicodeDecodeError as e:
+            logger.warning(f"UnicodeDecodeError: {path}, file pass")
+            content = ""
     content = ''.join(filter(lambda x: x in PRINTABLE, content))
 
     return content
 
 
-def concat_and_save(path: str, source_files: List[str]):
-    with open(path, 'w', encoding='utf8') as f:
-        for i, source in enumerate(source_files):
-            f.write(read_file(source) + "\n")
+def save_file(content, file_path: str):
+    with open(file_path, 'w', encoding='utf8') as f:
+        f.write(str(content))
+
 
+def merge_and_save(source_files, path):
+    with open(path, 'w', encoding='utf8') as f:
+        for src in source_files:
+            f.write(read_file(src) + "\n\n")
 
-def get_repos_from_readme(file_path='download/pytorch_awesome.md'):
-    with open(str(file_path), 'r') as f:
-        content = f.read()
 
+def get_repos_from_readme(readme_content):
     link_pattern = re.compile(r"""
         \[(?P<title>[^\]]*)\] # title
         \((?P<utl>[^\)]*)\) # url
     """, re.VERBOSE)
 
-    res = link_pattern.findall(content)
+    res = link_pattern.findall(readme_content)
 
     github_repos = []
     repo_pattern = re.compile(r'https://github.com/(?P<user>[^/]*)/(?P<repo>[^/#]*)$')
@@ -68,16 +65,8 @@ def get_repos_from_readme(file_path='download/pytorch_awesome.md'):
     return github_repos
 
 
-def get_awesome_pytorch_readme(file_path='download/pytorch_awesome.md'):
-    md = urllib.request.urlopen('https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md')
-    content = md.read()
-
-    with open(file_path, 'w', encoding='utf8') as f:
-        f.write(str(content))
-
-
-def download_repo(org: str, repo: str, idx: Optional[int]):
-    zip_file = Path(f'download/{org}_{repo}.zip')
+def download_repo(save_dir: str, org: str, repo: str, idx: Optional[int]):
+    zip_file = Path(f'{save_dir}/{org}_{repo}.zip')
 
     if zip_file.exists():
         return zip_file
@@ -103,17 +92,12 @@ def download_repo(org: str, repo: str, idx: Optional[int]):
     return zip_file
 
 
-def create_folders():
-    source = Path('download/source')
-    if not source.exists():
-        source.mkdir(parents=True)
-
-
-def extract_zip(file_path: Path):
-    source = Path('download/source')
+def extract_zip(source_dir, file_path: Path):
+    source = Path(source_dir)
     logger.debug(f"Extract {file_path}")
     repo_source = source / file_path.stem
     if repo_source.exists():
+        logger.debug(f"Exists: {repo_source}")
         return repo_source
     try:
         with zipfile.ZipFile(file_path, 'r') as repo_zip:
@@ -139,42 +123,75 @@ def remove_files(path: Path, keep: Set[str]):
                 p.unlink()
 
 
-def progressive(limit_size=None):
-    logger.debug('Get pytorch_awesome')
-    # Get repos
-    get_awesome_pytorch_readme()
-    repos = get_repos_from_readme()
-    if limit_size:
-        repos = repos[:limit_size]
-    # Download zips
-    # Download TheAlgorithms Python repo
-    zip_file = download_repo("TheAlgorithms", "Python", 0)
-    extract_zip(zip_file)
-    # Download other repos
-    for i, r in enumerate(repos):
-        zip_file = download_repo(r[0], r[1], i)
-        if not zip_file:
-            continue
-        extracted = extract_zip(zip_file)
-        remove_files(extracted, keep={'.py'})
-
-
-def main():
-    create_folders()
-    try:
-        progressive(limit_size=10)
-    except KeyboardInterrupt:
-        pass
-    source_files = get_python_files()
-    logger.debug(f'Source_files size: {len(source_files)}')
-    np.random.shuffle(source_files)
-    train_valid_split = int(len(source_files) * 0.9)
-    train_file = 'download/train.txt'
-    valid_file = 'download/valid.txt'
-    concat_and_save(train_file, source_files[:train_valid_split])
-    concat_and_save(valid_file, source_files[train_valid_split:])
-    logger.info(f'Save train file: {train_file}, valid file: {valid_file}')
-
-
-if __name__ == '__main__':
-    main()
+def get_source_code_by_language(code_languages=("python", "java", "cpp"),
+                                save_dir='download/',
+                                each_limit_repos=3):
+    sources = dict()
+    Path(save_dir).mkdir(parents=True, exist_ok=True)
+    if isinstance(code_languages, str):
+        code_languages = [code_languages]
+    logger.info(f"Get source code by language: {code_languages}")
+
+    def get_source_files_by_readme(readme_content, sub_save_dir, limit_size):
+        zip_dir = sub_save_dir + "/zip"
+        src_dir = sub_save_dir + "/src"
+        Path(zip_dir).mkdir(parents=True, exist_ok=True)
+        Path(src_dir).mkdir(parents=True, exist_ok=True)
+        repos = get_repos_from_readme(readme_content)
+        if limit_size:
+            repos = repos[:limit_size]
+        # Download repos
+        for i, r in enumerate(repos):
+            zip_file = download_repo(zip_dir, r[0], r[1], i)
+            if not zip_file:
+                continue
+            extracted = extract_zip(src_dir, zip_file)
+            remove_files(extracted, keep={suffix})
+        source_files = glob.glob(f"{src_dir}/**/*{suffix}", recursive=True)
+        logger.info(f"Path: {src_dir}/**/*{suffix}, file size: {len(source_files)}")
+        return source_files
+
+    if "python" in code_languages:
+        logger.debug('Get awesome-python')
+        suffix = '.py'
+        sub_save_dir = os.path.join(save_dir, 'python')
+        readme_file = sub_save_dir + '/README.md'
+        if os.path.exists(readme_file):
+            readme_content = read_file(readme_file)
+        else:
+            readme_content = urllib.request.urlopen(
+                'https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md').read()
+            readme_content = str(readme_content)
+            save_file(readme_content, readme_file)
+        sources['python'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
+        logger.info(f"Get source code by language: python done")
+    if "java" in code_languages:
+        logger.debug('Get awesome-java')
+        suffix = '.java'
+        sub_save_dir = os.path.join(save_dir, 'java')
+        readme_file = sub_save_dir + '/README.md'
+        if os.path.exists(readme_file):
+            readme_content = read_file(readme_file)
+        else:
+            readme_content = urllib.request.urlopen(
+                'https://raw.githubusercontent.com/akullpp/awesome-java/master/README.md').read()
+            readme_content = str(readme_content)
+            save_file(readme_content, readme_file)
+        sources['java'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
+        logger.info(f"Get source code by language: java done")
+    if "cpp" in code_languages:
+        logger.debug('Get awesome-cpp')
+        suffix = '.cpp'
+        sub_save_dir = os.path.join(save_dir, 'cpp')
+        readme_file = sub_save_dir + '/README.md'
+        if os.path.exists(readme_file):
+            readme_content = read_file(readme_file)
+        else:
+            readme_content = urllib.request.urlopen(
+                'https://raw.githubusercontent.com/fffaraz/awesome-cpp/master/README.md').read()
+            readme_content = str(readme_content)
+            save_file(readme_content, readme_file)
+        sources['cpp'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
+        logger.info(f"Get source code by language: cpp done")
+
+    return sources
diff --git a/autocomplete/upload_dataset.py b/autocomplete/upload_dataset.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+@author:XuMing(xuming624@qq.com)
+@description:
+"""
+
+"""Code AutoComplete Python dataset Corpus.(code_autocomplete)"""
+
+import os
+
+import datasets
+
+_DESCRIPTION = """纯文本数据，内容：高质量编程源代码，包括Python，Java，CPP源代码"""
+
+PYTHON_HOME = "https://github.com/bharathgs/Awesome-pytorch-list"
+JAVA_HOME = "https://github.com/akullpp/awesome-java"
+CPP_HOME = "https://github.com/fffaraz/awesome-cpp"
+
+_CITATION = "https://github.com/shibing624/code-autocomplete"
+
+_DATA_URL = "https://huggingface.co/datasets/shibing624/python-source-code/resolve/main/download.zip"
+
+
+class SourceCodeConfig(datasets.BuilderConfig):
+    """BuilderConfig for NLI_zh"""
+
+    def __init__(self, features, data_url, citation, url, **kwargs):
+        """BuilderConfig for NLI_zh
+        Args:
+          features: `list[string]`, list of the features that will appear in the
+            feature dict. Should not include "label".
+          data_url: `string`, url to download the zip file from.
+          citation: `string`, citation for the data set.
+          url: `string`, url for information about the data set.
+          **kwargs: keyword arguments forwarded to super.
+        """
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+        self.features = features
+        self.data_url = data_url
+        self.citation = citation
+        self.url = url
+
+
+class SourceCode(datasets.GeneratorBasedBuilder):
+    """The Natural Language Inference Chinese(NLI_zh) Corpus."""
+
+    BUILDER_CONFIGS = [
+        SourceCodeConfig(
+            name="python",
+            description=_DESCRIPTION,
+            features=["code"],
+            data_url=_DATA_URL,
+            citation=_CITATION,
+            url=PYTHON_HOME,
+        ),
+        SourceCodeConfig(
+            name="java",
+            description=_DESCRIPTION,
+            features=["code"],
+            data_url=_DATA_URL,
+            citation=_CITATION,
+            url=JAVA_HOME,
+        ),
+        SourceCodeConfig(
+            name="cpp",
+            description=_DESCRIPTION,
+            features=["code"],
+            data_url=_DATA_URL,
+            citation=_CITATION,
+            url=CPP_HOME,
+        ),
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=self.config.description,
+            features=datasets.Features(
+                {
+                    "text": datasets.Value("string"),
+                }
+            ),
+            homepage=self.config.url,
+            citation=self.config.citation,
+        )
+
+    def _split_generators(self, dl_manager):
+        dl_dir = dl_manager.download_and_extract(self.config.data_url) or ""
+        dl_dir = os.path.join(dl_dir, f"download/{self.config.name}")
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(dl_dir, f"train.txt"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(dl_dir, f"valid.txt"),
+                },
+            ),
+            # datasets.SplitGenerator(
+            #     name=datasets.Split.TEST,
+            #     gen_kwargs={
+            #         "filepath": os.path.join(dl_dir, f"test.txt"),
+            #     },
+            # ),
+        ]
+
+    def _generate_examples(self, filepath):
+        """This function returns the examples in the raw (text) form."""
+        with open(filepath, 'r', encoding="utf-8") as f:
+            for idx, row in enumerate(f):
+                if row.strip():
+                    yield idx, {"text": row}
+                else:
+                    yield idx, {"text": ""}