Skip to content

Commit

Permalink
update dataset.
Browse files Browse the repository at this point in the history
  • Loading branch information
shibing624 committed Mar 1, 2022
1 parent 8a09213 commit 5b36832
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 15,443 deletions.
169 changes: 93 additions & 76 deletions autocomplete/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
@description: Parse all files and write to a single file
refer: https://github.com/labmlai/python_autocomplete/blob/master/python_autocomplete/create_dataset.py
"""
import os
import glob
import re
import string
Expand All @@ -13,50 +14,46 @@
from pathlib import Path
from typing import List, Set
from typing import Optional
import numpy as np
from loguru import logger
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
PRINTABLE = set(string.printable)


def get_python_files(dir_path: str = "download/source"):
"""
Get list of python files and their paths inside `data/source` folder
"""
files = glob.glob(f"{dir_path}/**/*.py", recursive=True)

return files


def read_file(path: str) -> str:
"""
Read a file
"""
with open(path, 'r', encoding='utf8') as f:
content = f.read()
try:
content = f.read()
except UnicodeDecodeError as e:
logger.warning(f"UnicodeDecodeError: {path}, file pass")
content = ""
content = ''.join(filter(lambda x: x in PRINTABLE, content))

return content


def concat_and_save(path: str, source_files: List[str]):
with open(path, 'w', encoding='utf8') as f:
for i, source in enumerate(source_files):
f.write(read_file(source) + "\n")
def save_file(content, file_path: str):
with open(file_path, 'w', encoding='utf8') as f:
f.write(str(content))


def merge_and_save(source_files, path):
with open(path, 'w', encoding='utf8') as f:
for src in source_files:
f.write(read_file(src) + "\n\n")

def get_repos_from_readme(file_path='download/pytorch_awesome.md'):
with open(str(file_path), 'r') as f:
content = f.read()

def get_repos_from_readme(readme_content):
link_pattern = re.compile(r"""
\[(?P<title>[^\]]*)\] # title
\((?P<utl>[^\)]*)\) # url
""", re.VERBOSE)

res = link_pattern.findall(content)
res = link_pattern.findall(readme_content)

github_repos = []
repo_pattern = re.compile(r'https://github.com/(?P<user>[^/]*)/(?P<repo>[^/#]*)$')
Expand All @@ -68,16 +65,8 @@ def get_repos_from_readme(file_path='download/pytorch_awesome.md'):
return github_repos


def get_awesome_pytorch_readme(file_path='download/pytorch_awesome.md'):
md = urllib.request.urlopen('https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md')
content = md.read()

with open(file_path, 'w', encoding='utf8') as f:
f.write(str(content))


def download_repo(org: str, repo: str, idx: Optional[int]):
zip_file = Path(f'download/{org}_{repo}.zip')
def download_repo(save_dir: str, org: str, repo: str, idx: Optional[int]):
zip_file = Path(f'{save_dir}/{org}_{repo}.zip')

if zip_file.exists():
return zip_file
Expand All @@ -103,17 +92,12 @@ def download_repo(org: str, repo: str, idx: Optional[int]):
return zip_file


def create_folders():
source = Path('download/source')
if not source.exists():
source.mkdir(parents=True)


def extract_zip(file_path: Path):
source = Path('download/source')
def extract_zip(source_dir, file_path: Path):
source = Path(source_dir)
logger.debug(f"Extract {file_path}")
repo_source = source / file_path.stem
if repo_source.exists():
logger.debug(f"Exists: {repo_source}")
return repo_source
try:
with zipfile.ZipFile(file_path, 'r') as repo_zip:
Expand All @@ -139,42 +123,75 @@ def remove_files(path: Path, keep: Set[str]):
p.unlink()


def progressive(limit_size=None):
logger.debug('Get pytorch_awesome')
# Get repos
get_awesome_pytorch_readme()
repos = get_repos_from_readme()
if limit_size:
repos = repos[:limit_size]
# Download zips
# Download TheAlgorithms Python repo
zip_file = download_repo("TheAlgorithms", "Python", 0)
extract_zip(zip_file)
# Download other repos
for i, r in enumerate(repos):
zip_file = download_repo(r[0], r[1], i)
if not zip_file:
continue
extracted = extract_zip(zip_file)
remove_files(extracted, keep={'.py'})


def main():
create_folders()
try:
progressive(limit_size=10)
except KeyboardInterrupt:
pass
source_files = get_python_files()
logger.debug(f'Source_files size: {len(source_files)}')
np.random.shuffle(source_files)
train_valid_split = int(len(source_files) * 0.9)
train_file = 'download/train.txt'
valid_file = 'download/valid.txt'
concat_and_save(train_file, source_files[:train_valid_split])
concat_and_save(valid_file, source_files[train_valid_split:])
logger.info(f'Save train file: {train_file}, valid file: {valid_file}')


if __name__ == '__main__':
main()
def get_source_code_by_language(code_languages=("python", "java", "cpp"),
save_dir='download/',
each_limit_repos=3):
sources = dict()
Path(save_dir).mkdir(parents=True, exist_ok=True)
if isinstance(code_languages, str):
code_languages = [code_languages]
logger.info(f"Get source code by language: {code_languages}")

def get_source_files_by_readme(readme_content, sub_save_dir, limit_size):
zip_dir = sub_save_dir + "/zip"
src_dir = sub_save_dir + "/src"
Path(zip_dir).mkdir(parents=True, exist_ok=True)
Path(src_dir).mkdir(parents=True, exist_ok=True)
repos = get_repos_from_readme(readme_content)
if limit_size:
repos = repos[:limit_size]
# Download repos
for i, r in enumerate(repos):
zip_file = download_repo(zip_dir, r[0], r[1], i)
if not zip_file:
continue
extracted = extract_zip(src_dir, zip_file)
remove_files(extracted, keep={suffix})
source_files = glob.glob(f"{src_dir}/**/*{suffix}", recursive=True)
logger.info(f"Path: {src_dir}/**/*{suffix}, file size: {len(source_files)}")
return source_files

if "python" in code_languages:
logger.debug('Get awesome-python')
suffix = '.py'
sub_save_dir = os.path.join(save_dir, 'python')
readme_file = sub_save_dir + '/README.md'
if os.path.exists(readme_file):
readme_content = read_file(readme_file)
else:
readme_content = urllib.request.urlopen(
'https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md').read()
readme_content = str(readme_content)
save_file(readme_content, readme_file)
sources['python'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
logger.info(f"Get source code by language: python done")
if "java" in code_languages:
logger.debug('Get awesome-java')
suffix = '.java'
sub_save_dir = os.path.join(save_dir, 'java')
readme_file = sub_save_dir + '/README.md'
if os.path.exists(readme_file):
readme_content = read_file(readme_file)
else:
readme_content = urllib.request.urlopen(
'https://raw.githubusercontent.com/akullpp/awesome-java/master/README.md').read()
readme_content = str(readme_content)
save_file(readme_content, readme_file)
sources['java'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
logger.info(f"Get source code by language: java done")
if "cpp" in code_languages:
logger.debug('Get awesome-cpp')
suffix = '.cpp'
sub_save_dir = os.path.join(save_dir, 'cpp')
readme_file = sub_save_dir + '/README.md'
if os.path.exists(readme_file):
readme_content = read_file(readme_file)
else:
readme_content = urllib.request.urlopen(
'https://raw.githubusercontent.com/fffaraz/awesome-cpp/master/README.md').read()
readme_content = str(readme_content)
save_file(readme_content, readme_file)
sources['cpp'] = get_source_files_by_readme(readme_content, sub_save_dir, each_limit_repos)
logger.info(f"Get source code by language: cpp done")

return sources
117 changes: 117 additions & 0 deletions autocomplete/upload_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""

"""Code AutoComplete Python dataset Corpus.(code_autocomplete)"""

import os

import datasets

_DESCRIPTION = """纯文本数据,内容:高质量编程源代码,包括Python,Java,CPP源代码"""

PYTHON_HOME = "https://github.com/bharathgs/Awesome-pytorch-list"
JAVA_HOME = "https://github.com/akullpp/awesome-java"
CPP_HOME = "https://github.com/fffaraz/awesome-cpp"

_CITATION = "https://github.com/shibing624/code-autocomplete"

_DATA_URL = "https://huggingface.co/datasets/shibing624/python-source-code/resolve/main/download.zip"


class SourceCodeConfig(datasets.BuilderConfig):
"""BuilderConfig for NLI_zh"""

def __init__(self, features, data_url, citation, url, **kwargs):
"""BuilderConfig for NLI_zh
Args:
features: `list[string]`, list of the features that will appear in the
feature dict. Should not include "label".
data_url: `string`, url to download the zip file from.
citation: `string`, citation for the data set.
url: `string`, url for information about the data set.
**kwargs: keyword arguments forwarded to super.
"""
super().__init__(version=datasets.Version("1.0.0"), **kwargs)
self.features = features
self.data_url = data_url
self.citation = citation
self.url = url


class SourceCode(datasets.GeneratorBasedBuilder):
"""The Natural Language Inference Chinese(NLI_zh) Corpus."""

BUILDER_CONFIGS = [
SourceCodeConfig(
name="python",
description=_DESCRIPTION,
features=["code"],
data_url=_DATA_URL,
citation=_CITATION,
url=PYTHON_HOME,
),
SourceCodeConfig(
name="java",
description=_DESCRIPTION,
features=["code"],
data_url=_DATA_URL,
citation=_CITATION,
url=JAVA_HOME,
),
SourceCodeConfig(
name="cpp",
description=_DESCRIPTION,
features=["code"],
data_url=_DATA_URL,
citation=_CITATION,
url=CPP_HOME,
),
]

def _info(self):
return datasets.DatasetInfo(
description=self.config.description,
features=datasets.Features(
{
"text": datasets.Value("string"),
}
),
homepage=self.config.url,
citation=self.config.citation,
)

def _split_generators(self, dl_manager):
dl_dir = dl_manager.download_and_extract(self.config.data_url) or ""
dl_dir = os.path.join(dl_dir, f"download/{self.config.name}")
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(dl_dir, f"train.txt"),
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": os.path.join(dl_dir, f"valid.txt"),
},
),
# datasets.SplitGenerator(
# name=datasets.Split.TEST,
# gen_kwargs={
# "filepath": os.path.join(dl_dir, f"test.txt"),
# },
# ),
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
with open(filepath, 'r', encoding="utf-8") as f:
for idx, row in enumerate(f):
if row.strip():
yield idx, {"text": row}
else:
yield idx, {"text": ""}
Loading

0 comments on commit 5b36832

Please sign in to comment.