Skip to content

Instantly share code, notes, and snippets.

@Eduardodudu
Last active May 25, 2022 16:04
Show Gist options
  • Save Eduardodudu/d16a07977689370769ea650cf1d16691 to your computer and use it in GitHub Desktop.
Save Eduardodudu/d16a07977689370769ea650cf1d16691 to your computer and use it in GitHub Desktop.
[Bibliometrix Adjustment] allowing for bibtex files #bibtex #bibliometrix
# Running
# Example session: attach bibliometrix, then load the replacement import
# functions before converting a BibTeX export.
library(bibliometrix)
# Sourced after library(), so functions defined in that file take precedence
# over same-named bibliometrix functions when called from the global environment.
# NOTE(review): presumably this file holds the function definitions listed
# under "# Functions" -- confirm the path.
source(file = "./Functions_Bibliometrix.R") #Bypass Bibliometrix
# Read the BibTeX file into a character vector of lines
data <- readFiles("data.bib")
# Convert the loaded lines into an R bibliographic data frame
M <- convert2df(data, dbsource = "isi", format = "bibtex")
# M can now be used with the bibliometrix functionalities
# Functions
# Read one or more bibliographic export files into one character vector.
#
# Arguments:
#   ...  File paths, given as separate strings and/or character vectors.
#
# Returns:
#   A character vector holding the lines of all files, in the order the paths
#   were supplied. Every line is passed through iconv(latin1 -> ASCII) with
#   sub = "", so bytes that are not plain ASCII are dropped. With no paths an
#   empty character vector is returned.
#
# Side effects:
#   If the "encoding" option is "UTF-8" it is switched to "native.enc" while
#   reading; the previous value is restored on exit, including when reading
#   fails.
readFiles <- function(...) {
  arguments <- unlist(list(...))
  k <- length(arguments)
  if (k == 0L) {
    return(character(0))
  }

  origEnc <- getOption("encoding")
  # Registered before the option is touched so that a failing readLines()
  # (e.g. missing file) cannot leave the session with a changed encoding.
  on.exit(options(encoding = origEnc), add = TRUE)
  if (origEnc == "UTF-8") {
    options(encoding = "native.enc")
  }

  D <- vector("list", k)
  for (i in seq_len(k)) {
    D[[i]] <- suppressWarnings(
      iconv(readLines(arguments[i], encoding = "UTF-8"), "latin1", "ASCII", sub = "")
    )
  }
  unlist(D)
}
# Convert raw bibliographic export lines into a bibliographic data frame.
#
# Arguments:
#   file      Character vector of file lines (as returned by readFiles()).
#   dbsource  One of "isi", "wos", "scopus", "pubmed", "cochrane", "generic".
#             "wos" is treated as "isi".
#   format    One of "plaintext", "bibtex", "pubmed", "cochrane". For
#             dbsource "isi"/"wos" only "plaintext" and "bibtex" are usable.
#
# Returns:
#   The data frame produced by the source-specific parser, with PY and TC
#   coerced to numeric (NA when absent), CR defaulted to "none", the AU_UN and
#   SR fields added via metaTagExtraction(), author names normalised, and row
#   names set to the SR values (duplicates get a running number appended).
#
# Errors:
#   Stops with an explicit message when dbsource or format is not supported.
#   Previously only a hint was printed and execution continued until an
#   unrelated "object 'M' not found" error (or a global M was picked up).
convert2df <- function(file, dbsource = "wos", format = "plaintext") {

  cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")

  validSources <- c("isi", "wos", "scopus", "pubmed", "cochrane", "generic")
  if (length(dbsource) != 1L || !(dbsource %in% validSources)) {
    stop("'dbsource' argument has to be a character string matching 'isi', 'wos', 'scopus', 'generic', 'pubmed' or 'cochrane'.",
         call. = FALSE)
  }

  validFormats <- c("plaintext", "bibtex", "pubmed", "cochrane")
  if (length(format) != 1L || !(format %in% validFormats)) {
    stop("'format' argument has to be a character string matching 'plaintext', 'bibtex', 'pubmed' or 'cochrane'.",
         call. = FALSE)
  }

  # Only the non plaintext/bibtex formats are forced to ASCII here
  if (!(format %in% c("plaintext", "bibtex"))) {
    file <- iconv(file, "latin1", "ASCII", sub = "")
  }

  if (dbsource == "wos") dbsource <- "isi"

  M <- switch(dbsource,
    isi = {
      switch(format,
        bibtex = bib2df(file, dbsource = "isi"),
        plaintext = isi2df(file),
        stop("'format' has to be 'plaintext' or 'bibtex' when 'dbsource' is 'isi' or 'wos'.",
             call. = FALSE)
      )
    },
    scopus = bib2df(file, dbsource = "scopus"),
    generic = bib2df(file, dbsource = "generic"),
    pubmed = pubmed2df(file),
    cochrane = cochrane2df(file)
  )

  if ("PY" %in% names(M)) {
    M$PY <- as.numeric(M$PY)
  } else {
    M$PY <- NA
  }
  if ("TC" %in% names(M)) {
    M$TC <- as.numeric(M$TC)
  } else {
    M$TC <- NA
  }
  if (!("CR" %in% names(M))) {
    M$CR <- "none"
  }
  # Replace the typographic apostrophe (U+2019) in author names by a plain one
  if (dbsource != "cochrane") {
    M$AU <- gsub(intToUtf8(8217), intToUtf8(39), M$AU)
  }
  cat("Done!\n\n")

  ## AU_UN field creation
  if ("C1" %in% names(M)) {
    cat("\nGenerating affiliation field tag AU_UN from C1: ")
    M <- metaTagExtraction(M, Field = "AU_UN")
    cat("Done!\n\n")
  } else {
    M$C1 <- NA
    M$AU_UN <- NA
  }

  ### SR field creation
  suppressWarnings(M <- metaTagExtraction(M, Field = "SR"))

  ## AU normalization: punctuation becomes a space, then each name is
  ## squeezed/trimmed and the names are re-joined with ";"
  M$AU <- unlist(lapply(strsplit(M$AU, ";"), function(x) {
    x <- trimws(trimES(gsub("[[:punct:]]", " ", x)))
    paste(x, collapse = ";")
  }))

  ### identify duplicated SRs and make them unique by appending " 1", " 2", ...
  SR <- M$SR
  tab <- table(SR)
  tab2 <- table(tab)
  ind <- as.numeric(names(tab2))
  ind <- ind[which(ind > 1)]
  for (i in ind) {
    indice <- names(which(tab == i))
    for (j in indice) {
      indice2 <- which(SR == j)
      SR[indice2] <- paste(SR[indice2], as.character(seq_along(indice2)), sep = " ")
    }
  }

  row.names(M) <- SR
  return(M)
}
# Parse BibTeX lines into a data frame with one row per record and one
# column per recognised field tag.
#
# Arguments:
#   D         Character vector of BibTeX lines.
#   dbsource  "isi", "scopus" or "generic"; selects which column of the
#             `bibtag` dictionary maps BibTeX field names to field tags.
#
# Returns:
#   The data frame returned by postprocessing() for the extracted fields.
#
# Errors:
#   Stops when dbsource is not supported or when no record header is found.
#
# Changes against the previous version:
#   * brace stripping used apply(), which collapses a one-row data frame into
#     a vector, so a file holding a single record was corrupted; columns are
#     now processed with lapply();
#   * 1:n loops replaced by seq_len()/seq_along(); empty input gives a clear
#     error instead of failing inside seq();
#   * an unsupported dbsource is reported instead of leaving `bibtag2`
#     undefined.
bib2df <- function(D, dbsource = "isi") {

  # data() loads the `bibtag` dictionary into this function's environment
  bibtag <- NULL
  data("bibtag", envir = environment())
  bibtag <- as.data.frame(bibtag)

  txt <- preprocessing(D)
  D <- txt$D      # lower-cased lines, used for matching field names
  DD <- txt$DD    # same lines in original case, used for the stored values
  Tag <- txt$Tag

  tagColumn <- switch(dbsource,
    isi = "ISI",
    scopus = "SCOPUS",
    generic = "GENERIC",
    stop("'dbsource' has to be one of 'isi', 'scopus' or 'generic'.", call. = FALSE)
  )
  ind <- which(bibtag[, tagColumn] %in% Tag)
  bibtag2 <- bibtag[ind, c("TAG", tagColumn)]

  uniqueTag <- as.character(bibtag2$TAG)
  Tag <- gsub("\\{", "", bibtag2[, 2])

  # first row of each document
  Papers <- which(regexpr("manuscript=", D) == 1)
  if (length(Papers) == 0L) {
    stop("No BibTeX records (lines starting with '@') were found in the input.",
         call. = FALSE)
  }
  Papers <- c(Papers, length(D))

  # number of documents
  nP <- length(Papers) - 1

  DATA <- data.frame(matrix(NA, nP, length(uniqueTag)))
  names(DATA) <- uniqueTag

  # Progress is only reported when running inside a shiny reactive domain
  inShiny <- requireNamespace("shiny", quietly = TRUE) &&
    !is.null(shiny::getDefaultReactiveDomain())

  for (i in seq_len(nP)) {
    if (inShiny) {
      shiny::incProgress(1 / nP)
    }

    iP <- Papers[i]
    # The line just before the next header closes the current record
    iPs <- Papers[i + 1] - 1
    if (i %% 100 == 0 || i == nP) cat("Articles extracted ",i,"\n")
    iPiPs <- seq(iP, iPs)

    for (j in seq_along(Tag)) {
      field <- uniqueTag[j]
      POS <- which(regexpr(Tag[j], D[iPiPs]) == 1) + iP - 1
      # Fields that are absent or occur more than once are left as NA
      if (length(POS) == 1) {
        Seq <- seq(POS, iPs)
        # The field value ends on the first following line containing "}"
        END <- which(regexpr(".*\\}", D[Seq]) == 1)[1]
        POSEND <- seq(POS, (POS + END - 1))

        if (field == "C1") {
          if (dbsource != "isi") {
            DATA[[field]][i] <- paste0(DD[POSEND], collapse = ";")
          } else {
            DATA[[field]][i] <- paste0(gsub(";", ",", DD[POSEND]), collapse = ";")
          }
        } else if (field == "CR") {
          if (length(POSEND) > 1) {
            DATA[[field]][i] <- paste0(gsub(";", ",", DD[POSEND]), collapse = ";")
          } else {
            DATA[[field]][i] <- paste0(DD[POSEND], collapse = ";")
          }
        } else {
          DATA[[field]][i] <- paste0(gsub(";", ",", DD[POSEND]), collapse = " ")
        }

        # DOI is taken from its first line only, with commas removed
        if (field == "DI") {
          DOI <- gsub("doi = ", "", DD[POS])
          DATA[[field]][i] <- gsub(",", "", DOI)
        }
      }
    }
  }

  if ("DT2" %in% names(DATA)) {
    DATA$DT2 <- substr(DATA$DT2, 1, regexpr("\\{", DATA$DT2) - 1)
  }

  # remove tags from fields: the prefix up to the first "=" of the first
  # non-missing value is removed from every non-missing value of the column
  for (field in uniqueTag) {
    filled <- !is.na(DATA[[field]])
    if (!any(filled)) next
    first <- DATA[[field]][filled][1]
    pattern <- substr(first, 1, regexpr("=", first))
    DATA[[field]][filled] <- gsub(pattern, "", DATA[[field]][filled], fixed = TRUE)
  }

  ## removing { and }
  # Column-wise lapply keeps the data frame shape even for a single record
  for (brace in c("\\{", "\\},", "\\}")) {
    DATA[] <- lapply(DATA, function(d) gsub(brace, "", d))
  }

  DATA <- postprocessing(DATA, dbsource)

  return(DATA)
}
### TEXT file preprocessing
# Normalise raw BibTeX lines and collect the field names they contain.
#
# Arguments:
#   D  Character vector of BibTeX lines.
#
# Returns a list with
#   D    the normalised lines in lower case,
#   DD   the normalised lines in their original case,
#   Tag  the distinct lower-cased "name={" prefixes found, including
#        "manuscript={" for the record header lines.
preprocessing <- function(D) {

  # Header lines are located on the raw input, before trimming.
  # NOTE(review): a header preceded by whitespace is therefore not detected
  # -- confirm this is intended.
  headerRows <- which(regexpr("\\@", D) == 1)

  lines <- trim(D)

  ## normalize bibtex data: each rule is c(pattern, replacement), applied in order
  rules <- list(
    c("\\s+", " "),
    c("\\{\\[\\}", "\\["),
    c("\\{\\{", "\\{"),
    c("\\}\\}", "\\}"),
    c("\\{''\\}", ""),
    c(" = ", "=")
  )
  for (rule in rules) {
    lines <- gsub(rule[1], rule[2], lines)
  }

  # Copy used only for tag discovery: headers get a "={" so they are picked
  # up as the pseudo field "manuscript={"
  tagLines <- lines
  tagLines[headerRows] <- gsub("\\@", "manuscript={", lines[headerRows])
  lines[headerRows] <- gsub("\\@", "manuscript=", lines[headerRows])

  hasField <- which(regexpr("=\\{", tagLines) > -1)
  fieldNames <- unique(gsub("(=\\{).*", "\\1", tagLines[hasField]))

  # Any remaining "@" is removed from the data lines
  lines <- gsub("@", "", lines)

  list(D = tolower(lines), DD = lines, Tag = tolower(fieldNames))
}
### DATA FRAME postprocessing
# Clean the fields extracted from BibTeX records.
#
# Arguments:
#   DATA      Data frame with one row per record; the AU column is rewritten,
#             the other columns are handled only when present.
#   dbsource  Source label; stored in the DB column ("SCOPUS" is stored when
#             dbsource is "generic").
#
# Returns:
#   A data frame in which every column has been passed through toupper().
postprocessing <- function(DATA, dbsource) {

  # Authors: "Surname, Given Names and ..." becomes "Surname INITIALS;..."
  DATA$AU <- gsub("\\s+", " ", DATA$AU)
  shortenAuthors <- function(l) {
    surname <- trim(gsub(",.*", "", l))
    given <- strsplit(trim(gsub(".*,", "", l)), " ")
    # gsub() coerces the list to one string per author; only capital letters
    # (and ":") are kept, which leaves the initials
    initials <- gsub("[^:A-Z:]", "", given)
    paste(surname, unlist(initials), sep = " ", collapse = ";")
  }
  DATA$AU <- unlist(lapply(strsplit(DATA$AU, " and "), shortenAuthors))

  # Times cited and publication year: keep the first run of digits
  for (numCol in c("TC", "PY")) {
    if (numCol %in% names(DATA)) {
      DATA[[numCol]] <- as.numeric(sub("\\D*(\\d+).*", "\\1", DATA[[numCol]]))
    }
  }

  if ("UT" %in% names(DATA)) {
    DATA$UT <- gsub(":", "", DATA$UT, fixed = TRUE)
  }

  # Without an RP column, use the part of C1 before the first "."
  if ("C1" %in% names(DATA) && !("RP" %in% names(DATA))) {
    DATA$RP <- unlist(lapply(strsplit(DATA$C1, "\\."), function(l) l[1]))
  }

  # keywords post-processing (missing ";" in some rows)
  # NOTE(review): the first pattern is a single space, so every space inside a
  # keyword becomes ";" -- confirm this is the intended separator pattern.
  for (kwCol in c("ID", "DE")) {
    if (kwCol %in% names(DATA)) {
      DATA[[kwCol]] <- gsub(" ", ";", DATA[[kwCol]])
      DATA[[kwCol]] <- gsub(",", ";", DATA[[kwCol]])
    }
  }

  ### merge Sources and Proceedings: a missing SO is filled from BO
  if (all(c("SO", "BO") %in% names(DATA))) {
    noSource <- which(is.na(DATA$SO))
    DATA$SO[noSource] <- DATA$BO[noSource]
  }

  if ("PN" %in% names(DATA)) {
    DATA$PN <- as.numeric(gsub("[^0-9]", "", DATA$PN))
  }

  DATA$DB <- if (dbsource != "generic") dbsource else "SCOPUS"

  data.frame(lapply(DATA, toupper), stringsAsFactors = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment