Created
March 17, 2023 13:35
-
-
Save grosa1/c39e176252a9af0a10b75ccd88226d86 to your computer and use it in GitHub Desktop.
repo_cloner.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import logging | |
| from shutil import rmtree | |
| import json | |
| import pandas as pd | |
| import git | |
| import traceback | |
| from pathlib import Path | |
# Root logger: timestamp, level and message separated by " :: ".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s :: %(levelname)s :: %(message)s',
)

# CSV listing the projects to clone; the script reads its "projectid" column.
REPOS_CSV = "filterby_star/git_commit.csv"
# Directory (relative to the working directory) under which clones are placed.
REPOS_DIR = "cloned"
# Working directory captured at start-up.
BASE_DIR = os.getcwd()
def is_git_repo(path):
    """Return True if *path* can be opened as a git repository.

    Args:
        path: Filesystem path to probe.

    Returns:
        True when ``git.Repo(path)`` succeeds, False when the path is not a
        git repository or does not exist.
    """
    try:
        # Constructing the Repo object is what validates the path; reading
        # ``git_dir`` simply forces use of the resolved repository.
        _ = git.Repo(path).git_dir
    except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError):
        # NoSuchPathError is raised for a missing path; a predicate should
        # answer False in that case rather than propagate the error.
        return False
    return True
def shallow_clone_commit(repo_name, commit_hash, repo_dir):
    """Shallow-clone (depth 1) a hosted repository below ``REPOS_DIR``.

    Args:
        repo_name: Project identifier. Names starting with ``bitbucket.org/``
            or ``gitlab.com/`` are treated as ``<host>/<owner>_<repo>`` (the
            first underscore is turned into a slash); anything else is
            treated as a GitHub ``owner/repo`` name.
        commit_hash: Currently unused. The clone is taken from the tip of the
            remote's default branch, not from this commit.
        repo_dir: Currently unused. The destination directory is derived
            from ``repo_name`` instead.

    Raises:
        Whatever ``git.Repo.clone_from`` raises on failure (presumably
        ``git.exc.GitCommandError`` -- confirm against the GitPython docs).
    """
    # NOTE: the URLs embed dummy "test:test" credentials, presumably so that
    # git fails instead of prompting when a repository is private or gone.
    if repo_name.startswith(("bitbucket.org/", "gitlab.com/")):
        clone_dir = os.path.join(
            REPOS_DIR, repo_name.split("/", 1)[1].replace("_", "/", 1)
        )
        repo_name = repo_name.replace("_", "/", 1)
        repo_url = "https://test:test@{}.git".format(repo_name)
    else:
        # The host part of this literal had been mangled by e-mail
        # obfuscation; restored to the same user:password@host form as above.
        repo_url = "https://test:test@github.com/{}.git".format(repo_name)
        clone_dir = os.path.join(REPOS_DIR, repo_name)

    # NOTE: an earlier shell-based variant fetched exactly ``commit_hash``
    # (git init / remote add / fetch --depth 1 / checkout FETCH_HEAD).
    # The current implementation clones the default branch tip only.
    Path(clone_dir).mkdir(parents=True, exist_ok=True)
    logging.info(f">cloning {repo_url} to {clone_dir}")
    git.Repo.clone_from(url=repo_url, to_path=clone_dir, depth=1)
if __name__ == "__main__":
    # Clone every project listed in REPOS_CSV. A failure on one row is
    # logged with its traceback and the loop moves on to the next row.
    for _, row in pd.read_csv(REPOS_CSV).iterrows():
        # Bind both names before the try block so the error handler never
        # hits an unbound name or reports the previous row's repository.
        repo_full_name = "<unknown>"
        commit_sha = ""
        try:
            # "owner_repo" -> "owner/repo" (only the first underscore).
            repo_full_name = row["projectid"].replace("_", "/", 1)
            repo_dir = os.path.join(REPOS_DIR, *repo_full_name.split("/"))

            # Skipping of already-cloned repositories is disabled:
            # if os.path.isdir(repo_dir) and is_git_repo(repo_dir):
            #     logging.info("Repository already present: {}".format(repo_full_name))
            #     continue

            logging.info(">processing %s", repo_full_name)
            shallow_clone_commit(repo_full_name, commit_sha, repo_dir)
            logging.info("+done: " + repo_full_name)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the run.
            logging.error(f"{repo_full_name} {commit_sha} failed - {traceback.format_exc()}")
            # rmtree(repo_dir, ignore_errors=True)
    logging.info("+++ Finished +++")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment