# Skip to content

# Instantly share code, notes, and snippets.

# @grosa1
# Created March 17, 2023 13:35
# Show Gist options
#   • Save grosa1/c39e176252a9af0a10b75ccd88226d86 to your computer and use it in GitHub Desktop.
# repo_cloner.py
import os
import logging
from shutil import rmtree
import json
import pandas as pd
import git
import traceback
from pathlib import Path
# CSV listing the projects to clone; the driver loop reads its "projectid" column.
REPOS_CSV = "filterby_star/git_commit.csv"
# Directory (relative to the working directory) under which clones are placed.
REPOS_DIR = "cloned"
# Working directory at start-up.
BASE_DIR = os.getcwd()

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s :: %(levelname)s :: %(message)s',
)
def is_git_repo(path):
    """Return True if ``path`` can be opened as a Git repository, else False.

    Args:
        path: Filesystem path to probe.

    Returns:
        bool: True when ``git.Repo(path)`` succeeds; False when the path is
        not a Git repository or does not exist at all.
    """
    try:
        _ = git.Repo(path).git_dir
    except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError):
        # NoSuchPathError is what GitPython raises for a missing path; treat
        # it the same as "not a repository" instead of crashing the caller.
        return False
    return True
def shallow_clone_commit(repo_name, commit_hash, repo_dir):
    """Shallow-clone (depth 1) the repository ``repo_name`` under ``REPOS_DIR``.

    Args:
        repo_name: Either ``"<owner>/<repo>"`` (cloned from github.com), or a
            name starting with ``"bitbucket.org/"`` or ``"gitlab.com/"``. In
            the latter case the first ``_`` after the host is turned into the
            owner/repository ``/`` separator.
        commit_hash: Currently unused; no specific commit is fetched or
            checked out. Kept so existing callers keep working.
        repo_dir: Currently unused; the destination directory is derived
            from ``repo_name`` and ``REPOS_DIR`` instead.

    Any exception raised by ``git.Repo.clone_from`` propagates to the caller.
    """
    # NOTE(review): the literal "test:test" credentials look like placeholders
    # so that git fails instead of prompting for authentication -- confirm.
    if repo_name.startswith(("bitbucket.org/", "gitlab.com/")):
        # Destination drops the host part: cloned/<owner>/<repo>.
        owner_and_repo = repo_name.split("/", 1)[1].replace("_", "/", 1)
        clone_dir = os.path.join(REPOS_DIR, owner_and_repo)
        repo_name = repo_name.replace("_", "/", 1)
        repo_url = f"https://test:test@{repo_name}.git"
    else:
        repo_url = f"https://test:test@github.com/{repo_name}.git"
        clone_dir = os.path.join(REPOS_DIR, repo_name)

    # NOTE: an earlier, disabled approach fetched exactly ``commit_hash`` via
    # `git init` + `git remote add origin <url>` +
    # `git fetch --depth 1 origin <commit>` + `git checkout FETCH_HEAD`
    # inside ``repo_dir``. The current code clones the remote with depth 1.
    Path(clone_dir).mkdir(parents=True, exist_ok=True)
    logging.info(">cloning %s to %s", repo_url, clone_dir)
    git.Repo.clone_from(url=repo_url, to_path=clone_dir, depth=1)
def main():
    """Clone every project listed in the ``projectid`` column of ``REPOS_CSV``.

    Each row is processed independently: a failure is logged with its
    traceback and the loop moves on to the next row.
    """
    for _, row in pd.read_csv(REPOS_CSV).iterrows():
        # Bind the names used by the error message up front, so a failure
        # while reading the row cannot hit an unbound (or stale) variable.
        repo_full_name = "<unknown projectid>"
        commit_sha = ""
        try:
            # The first "_" in projectid separates the first path component
            # from the rest (e.g. "<owner>_<repo>" -> "<owner>/<repo>").
            repo_full_name = row["projectid"].replace("_", "/", 1)
            repo_dir = os.path.join(REPOS_DIR, *repo_full_name.split("/"))
            # NOTE: a skip-if-already-cloned check (is_git_repo on repo_dir)
            # and an rmtree cleanup of failed clones were disabled in the
            # original script; re-enable them deliberately if needed.
            logging.info(">processing %s", repo_full_name)
            shallow_clone_commit(repo_full_name, commit_sha, repo_dir)
            logging.info("+done: " + repo_full_name)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # stop the batch instead of being logged as a failed repository.
            logging.error(f"{repo_full_name} {commit_sha} failed - {traceback.format_exc()}")
    logging.info("+++ Finished +++")


if __name__ == "__main__":
    main()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment