Last active
May 25, 2022 16:04
-
-
Save Eduardodudu/d16a07977689370769ea650cf1d16691 to your computer and use it in GitHub Desktop.
[Bibliometrix Adjustment] allowing for bibtex files #bibtex #bibliometrix
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage example ----
library(bibliometrix)

# Source the local overrides after attaching the package so that the
# definitions in Functions_Bibliometrix.R are the ones found first.
source("./Functions_Bibliometrix.R")

# Read the BibTeX export as a character vector of lines.
data <- readFiles("data.bib")

# Turn the raw lines into a bibliographic data frame.
M <- convert2df(file = data, dbsource = "isi", format = "bibtex")

# `M` can now be passed to the bibliometrix analysis functions.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Functions | |
#' Read one or more text files into a single character vector
#'
#' Every file is read line by line and converted from latin1 to ASCII;
#' characters that cannot be represented in ASCII are dropped (`sub = ""`).
#'
#' @param ... File paths, given as separate arguments and/or character vectors.
#' @return A character vector with the lines of all files, in argument order
#'   (`character(0)` when no file is supplied).
readFiles <- function(...) {
  arguments <- unlist(list(...))

  # Temporarily switch the session encoding away from UTF-8 while reading.
  # on.exit() guarantees the caller's option is restored even when
  # readLines() fails (e.g. missing file).
  origEnc <- getOption("encoding")
  on.exit(options(encoding = origEnc), add = TRUE)
  if (origEnc == "UTF-8") {
    options(encoding = "native.enc")
  }

  # seq_along() keeps the loop empty when no paths are given (1:0 would not).
  D <- lapply(seq_along(arguments), function(i) {
    suppressWarnings(
      iconv(readLines(arguments[i], encoding = "UTF-8"),
            "latin1", "ASCII", sub = "")
    )
  })

  D <- unlist(D)
  if (is.null(D)) character(0) else D
}
#' Convert raw bibliographic records into a bibliographic data frame
#'
#' Dispatches to the parser matching `dbsource`/`format`, coerces `PY` and
#' `TC` to numeric, adds the `AU_UN` and `SR` tags via `metaTagExtraction()`,
#' normalises the author field and uses the (de-duplicated) `SR` values as
#' row names.
#'
#' @param file Character vector of record lines (e.g. the output of
#'   `readFiles()`).
#' @param dbsource One of "isi", "wos" (alias of "isi"), "scopus", "pubmed",
#'   "cochrane" or "generic".
#' @param format One of "plaintext", "bibtex", "pubmed" or "cochrane". Only
#'   consulted for dispatch when `dbsource` is "isi"/"wos".
#' @return A data frame with one row per document.
convert2df <- function(file, dbsource = "wos", format = "plaintext") {

  cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")

  if (length(setdiff(dbsource, c("isi", "wos", "scopus", "pubmed", "cochrane", "generic"))) > 0) {
    cat("\n 'dbsource' argument is not properly specified")
    cat("\n 'dbsource' argument has to be a character string matching 'isi, 'wos', 'scopus', 'generic', or 'pubmed'.\n")
  }
  if (length(setdiff(format, c("plaintext", "bibtex", "pubmed", "cochrane"))) > 0) {
    cat("\n 'format' argument is not properly specified")
    cat("\n 'format' argument has to be a character string matching 'plaintext or 'bibtex'.\n")
  }

  # Formats other than plaintext/bibtex are stripped down to ASCII first.
  if (length(setdiff(format, c("plaintext", "bibtex"))) > 0) {
    file <- iconv(file, "latin1", "ASCII", sub = "")
  }

  if (dbsource == "wos") dbsource <- "isi"

  # switch() yields NULL when nothing matches, so an unsupported combination
  # is detected right below instead of leaving `M` undefined (which would
  # either fail obscurely or pick up an unrelated `M` from an enclosing
  # environment).
  M <- switch(dbsource,
    isi = switch(format,
      bibtex = bib2df(file, dbsource = "isi"),
      plaintext = isi2df(file)
    ),
    scopus = bib2df(file, dbsource = "scopus"),
    generic = bib2df(file, dbsource = "generic"),
    pubmed = pubmed2df(file),
    cochrane = cochrane2df(file)
  )
  if (is.null(M)) {
    stop("No converter available for dbsource = '", dbsource,
         "' with format = '", format, "'.", call. = FALSE)
  }

  if ("PY" %in% names(M)) {
    M$PY <- as.numeric(M$PY)
  } else {
    M$PY <- NA
  }
  if ("TC" %in% names(M)) {
    M$TC <- as.numeric(M$TC)
  } else {
    M$TC <- NA
  }
  if (!("CR" %in% names(M))) {
    M$CR <- "none"
  }

  # Replace the typographic apostrophe (U+2019) with a plain one in authors.
  if (dbsource != "cochrane") {
    M$AU <- gsub(intToUtf8(8217), intToUtf8(39), M$AU)
  }

  cat("Done!\n\n")

  ## AU_UN field creation
  if ("C1" %in% names(M)) {
    cat("\nGenerating affiliation field tag AU_UN from C1: ")
    M <- metaTagExtraction(M, Field = "AU_UN")
    cat("Done!\n\n")
  } else {
    M$C1 <- NA
    M$AU_UN <- NA
  }

  ### SR field creation
  suppressWarnings(M <- metaTagExtraction(M, Field = "SR"))

  ## AU normalization: punctuation -> space, squeeze/trim, re-join with ";"
  M$AU <- vapply(strsplit(M$AU, ";"), function(x) {
    x <- trimws(trimES(gsub("[[:punct:]]", " ", x)))
    paste(x, collapse = ";")
  }, character(1))

  ### identify duplicated SRs and make them unique by appending " 1", " 2", ...
  SR <- M$SR
  counts <- table(SR)
  for (key in names(counts)[counts > 1]) {
    hits <- which(SR == key)
    SR[hits] <- paste(SR[hits], as.character(seq_along(hits)), sep = " ")
  }

  row.names(M) <- SR
  M
}
#' Parse BibTeX lines into a data frame of field tags
#'
#' @param D Character vector with the lines of one or more BibTeX files.
#' @param dbsource Which column of the `bibtag` dictionary maps BibTeX field
#'   names to field tags: "isi", "scopus" or "generic".
#' @return The data frame returned by `postprocessing()`: one row per record,
#'   one column per recognised field tag.
bib2df <- function(D, dbsource = "isi") {

  # Tag dictionary; loaded with data(), so it must be provided by an
  # attached package (the usage script attaches bibliometrix).
  bibtag <- NULL
  data("bibtag", envir = environment())
  bibtag <- as.data.frame(bibtag)

  txt <- preprocessing(D)
  D <- txt$D      # lower-cased lines, used for matching
  DD <- txt$DD    # case-preserved lines, used for the stored values
  Tag <- txt$Tag  # field names found in the file

  # Keep only the dictionary rows whose field name occurs in the file.
  switch(dbsource,
    isi = {
      ind <- which(bibtag[, "ISI"] %in% Tag)
      bibtag2 <- bibtag[ind, c("TAG", "ISI")]
    },
    scopus = {
      ind <- which(bibtag[, "SCOPUS"] %in% Tag)
      bibtag2 <- bibtag[ind, c("TAG", "SCOPUS")]
    },
    generic = {
      ind <- which(bibtag[, "GENERIC"] %in% Tag)
      bibtag2 <- bibtag[ind, c("TAG", "GENERIC")]
    }
  )

  uniqueTag <- bibtag2$TAG
  Tag <- gsub("\\{", "", bibtag2[, 2])

  # first row of each document
  Papers <- which(regexpr("manuscript=", D) == 1)
  # NOTE(review): the sentinel is length(D) and each record ends at the next
  # start minus one, so the very last line of D is never scanned.
  Papers <- c(Papers, length(D))

  # number of documents
  nP <- length(Papers) - 1

  DATA <- data.frame(matrix(NA, nP, length(uniqueTag)))
  names(DATA) <- uniqueTag

  # seq_len()/seq_along() keep the loops empty when there is nothing to parse.
  for (i in seq_len(nP)) {
    if (!is.null(shiny::getDefaultReactiveDomain())) {
      shiny::incProgress(1 / nP)
    }

    iP <- Papers[i]
    iPs <- Papers[i + 1] - 1
    if (i %% 100 == 0 || i == nP) cat("Articles extracted ",i,"\n")
    iPiPs <- seq(iP, iPs)

    for (j in seq_along(Tag)) {
      # A field is only extracted when exactly one line of the record
      # starts with its name.
      POS <- which(regexpr(Tag[j], D[iPiPs]) == 1) + iP - 1
      if (length(POS) != 1) next

      # The field runs up to the first following line containing "}".
      Seq <- seq(POS, iPs)
      END <- which(regexpr(".*\\}", D[Seq]) == 1)[1]
      POSEND <- seq(POS, (POS + END - 1))

      field <- uniqueTag[j]
      raw <- DD[POSEND]

      if (field == "DI") {
        # DOI: first line only, commas removed.
        DOI <- gsub("doi = ", "", DD[POS])
        value <- gsub(",", "", DOI)
      } else if (field == "C1") {
        # Affiliations: lines joined with ";"; for ISI the inner ";" are
        # turned into "," first so they do not clash with the separator.
        if (dbsource != "isi") {
          value <- paste0(raw, collapse = ";")
        } else {
          value <- paste0(gsub(";", ",", raw), collapse = ";")
        }
      } else if (field == "CR") {
        # References: one per line when spread over several lines.
        if (length(POSEND) > 1) {
          value <- paste0(gsub(";", ",", raw), collapse = ";")
        } else {
          value <- paste0(raw, collapse = ";")
        }
      } else {
        value <- paste0(gsub(";", ",", raw), collapse = " ")
      }
      DATA[[field]][i] <- value
    }
  }

  if ("DT2" %in% names(DATA)) {
    DATA$DT2 <- substr(DATA$DT2, 1, regexpr("\\{", DATA$DT2) - 1)
  }

  # remove tags from fields: the "name=" prefix is taken from the first
  # non-missing value of each column and deleted from all of them
  for (i in seq_along(Tag)) {
    column <- DATA[[uniqueTag[i]]]
    filled <- !is.na(column)
    first <- column[filled][1]
    pattern <- substr(first, 1, regexpr("=", first))
    DATA[[uniqueTag[i]]][filled] <- gsub(pattern, "", column[filled], fixed = TRUE)
  }

  ## removing { and }
  # Done column by column: apply() on a one-row data frame returns a plain
  # vector, which as.data.frame() would turn into a single column and so
  # corrupt collections containing exactly one record.
  strip_braces <- function(d) {
    d <- gsub("\\{", "", d)
    d <- gsub("\\},", "", d)
    gsub("\\}", "", d)
  }
  DATA[] <- lapply(DATA, strip_braces)

  DATA <- postprocessing(DATA, dbsource)
  DATA
}
| ### TEXT file preprocessing | |
#' Normalise raw BibTeX lines before field extraction
#'
#' @param D Character vector of BibTeX lines.
#' @return A list with `D` (normalised lines, lower case), `DD` (the same
#'   lines with their original case) and `Tag` (the distinct lower-case
#'   `name={` field openers found, including `manuscript={`).
preprocessing <- function(D) {
  # Entry headers are located on the raw input, i.e. before trimming.
  entry_rows <- which(regexpr("\\@", D) == 1)

  # Clean-up rules applied in order: c(pattern, replacement).
  rules <- list(
    c("\\s+", " "),          # squeeze whitespace runs
    c("\\{\\[\\}", "\\["),   # "{[}" -> "["
    c("\\{\\{", "\\{"),      # "{{"  -> "{"
    c("\\}\\}", "\\}"),      # "}}"  -> "}"
    c("\\{''\\}", ""),       # drop "{''}"
    c(" = ", "=")            # "name = {" -> "name={"
  )
  lines <- trim(D)
  for (rule in rules) {
    lines <- gsub(rule[1], rule[2], lines)
  }

  # Copy used only for tag discovery: the entry header is given a "={" so it
  # is picked up as the pseudo field "manuscript={".
  tagged <- lines
  tagged[entry_rows] <- gsub("\\@", "manuscript={", lines[entry_rows])

  # In the data itself the header just gets the "manuscript=" prefix.
  lines[entry_rows] <- gsub("\\@", "manuscript=", lines[entry_rows])

  tagged <- tagged[which(regexpr("=\\{", tagged) > -1)]
  tags <- unique(gsub("(=\\{).*", "\\1", tagged))

  # Any "@" still present is removed.
  lines <- gsub("@", "", lines)

  list(D = tolower(lines), DD = lines, Tag = tolower(tags))
}
| ### DATA FRAME postprocessing | |
#' Clean up the data frame produced by the BibTeX parser
#'
#' @param DATA Data frame with one column per field tag; an `AU` column is
#'   used unconditionally, all other tags are processed only when present.
#' @param dbsource Source label stored in the `DB` column ("generic" is
#'   recorded as "SCOPUS").
#' @return A data frame in which every column is an upper-case character
#'   vector.
postprocessing <- function(DATA, dbsource) {
  has <- function(field) field %in% names(DATA)
  leading_number <- function(x) as.numeric(sub("\\D*(\\d+).*", "\\1", x))

  # Authors: "Surname, Given Names and ..." -> "Surname GN;..."
  format_authors <- function(l) {
    lastname <- trim(gsub(",.*", "", l))
    given <- strsplit(trim(gsub(".*,", "", l)), " ")
    # gsub() receives the list itself: each element is coerced to one string
    # and only upper-case letters (and ":") survive, i.e. the initials.
    initials <- gsub("[^:A-Z:]", "", given)
    paste(lastname, initials, sep = " ", collapse = ";")
  }
  DATA$AU <- gsub("\\s+", " ", DATA$AU)
  DATA$AU <- unlist(lapply(strsplit(DATA$AU, " and "), format_authors))

  # Times cited and publication year: keep the first run of digits.
  if (has("TC")) {
    DATA$TC <- leading_number(DATA$TC)
  }
  if (has("PY")) {
    DATA$PY <- leading_number(DATA$PY)
  }

  if (has("UT")) {
    DATA$UT <- gsub(":", "", DATA$UT, fixed = TRUE)
  }

  # Without an RP column, fall back to the text of C1 before the first ".".
  if (!has("RP") && has("C1")) {
    DATA$RP <- unlist(lapply(strsplit(DATA$C1, "\\."), function(l) l[1]))
  }

  # Keywords: double spaces and commas both become the ";" separator.
  for (kw in c("ID", "DE")) {
    if (has(kw)) {
      DATA[[kw]] <- gsub(",", ";", gsub("  ", ";", DATA[[kw]]))
    }
  }

  # Merge sources and proceedings: missing SO is filled from BO.
  if (has("SO") && has("BO")) {
    missing_so <- which(is.na(DATA$SO))
    DATA$SO[missing_so] <- DATA$BO[missing_so]
  }

  if (has("PN")) {
    DATA$PN <- as.numeric(gsub("[^0-9]", "", DATA$PN))
  }

  DATA$DB <- if (dbsource != "generic") dbsource else "SCOPUS"

  data.frame(lapply(DATA, toupper), stringsAsFactors = FALSE)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment