Created
November 7, 2011 17:06
-
-
Save jimmyrcom/1345530 to your computer and use it in GitHub Desktop.
Revisions
-
jimmyrcom created this gist
Nov 7, 2011 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,349 @@ /* %% By 2011 Jimmy Ruska (JimmyR.com), % Licensed under the Apache License, Version 2.0 (the "License"); you may not % use this file except in compliance with the License. You may obtain a copy of % the License at % % http://www.apache.org/licenses/LICENSE-2.0 % % Unless required by applicable law or agreed to in writing, software % distributed under the License is distributed on an "AS IS" BASIS, WITHOUT % WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the % License for the specific language governing permissions and limitations under % the License. alias nif="gcc -O3 -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c -I /usr/local/lib/erlang/erts-5.8.4/include" nif && cp crawler_utils.erl.so ~/e/crawler/c_src/ cd ~/c/; gcc -O3 -fno-optimize-sibling-calls -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c parse_url.c -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -lglib-2.0 -I /usr/local/lib/erlang/erts-5.8.4/include && cp ~/c/crawler_utils.erl.c ~/e/crawler/c_src/ && cp ~/c/parse_url.c ~/e/crawler/c_src/ && cp ~/c/parse_url.h ~/e/crawler/c_src/ maybe something to capitalize first char The functions htoi and url_decode are based on php source under this license. +----------------------------------------------------------------------+ | PHP Version 5 | +----------------------------------------------------------------------+ | Copyright (c) 1997-2011 The PHP Group | +----------------------------------------------------------------------+ | This source file is subject to version 3.01 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | http://www.php.net/license/3_01.txt | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | [email protected] so we can mail you a copy immediately. | +----------------------------------------------------------------------+ | Author: Jim Winstead <[email protected]> | +----------------------------------------------------------------------+ */ #include "erl_nif.h" #include "stdio.h" #include <ctype.h> #include "parse_url.h" //#include <string.h> void reverse_binary_unsafe(unsigned char* bin, size_t len); void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len); int hex_to_binary1(unsigned char* from,unsigned char* to, size_t len); char hex_char_to_int(const char n); char int_to_hex_char(const char n); static unsigned int htoi(unsigned char *s); int url_decode_unsafe1(unsigned char *str, int len); static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info); static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info); static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info); static void unload(ErlNifEnv* env, void* priv); static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info) { return 0; } static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info) { return 0; } static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info) { return 0; } static void unload(ErlNifEnv* env, void* priv) { return; } static ERL_NIF_TERM reverse_binary_unsafe1(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; enif_inspect_iolist_as_binary(env, argv[0], &block); reverse_binary_unsafe(block.data,block.size-1); return enif_make_binary(env, &block); } void reverse_binary_unsafe(unsigned char *bin, size_t len){ unsigned int i; char swap; for (i=0; i<len; i++, len--){ swap=bin[i]; bin[i] = bin[len]; bin[len]=swap; } } static ERL_NIF_TERM hex_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; size_t new_size; ErlNifBinary ret; enif_inspect_iolist_as_binary(env, argv[0], &block); if (block.size % 2) return enif_make_atom(env, "error"); if (block.size==0) return enif_make_binary(env, &block); if ( !enif_alloc_binary(block.size / 2, &ret) ) return enif_make_atom(env, "error"); new_size=hex_to_binary1(block.data,ret.data,block.size); if (!enif_realloc_binary(&ret, new_size) ) return enif_make_atom(env, "error"); return enif_make_binary(env, &ret); } static ERL_NIF_TERM hex_to_binary_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; size_t new_size; enif_inspect_iolist_as_binary(env, argv[0], &block); if(block.size==0) return enif_make_binary(env, &block); if (block.size % 2) return enif_make_atom(env, "error"); new_size=hex_to_binary1(block.data,block.data,block.size); if (!enif_realloc_binary(&block, new_size) ) return enif_make_atom(env, "error"); return enif_make_binary(env, &block); } int hex_to_binary1(unsigned char* from, unsigned char* to, size_t len){ int i,n; n=0; for(i=0;i<len-1;i++){ switch(from[i]){ case '\t': case ' ': case '\r': case '\n': continue; } //fprintf(stderr,"%i,",n); to[n++]=hex_char_to_int(from[i])*16 + hex_char_to_int(from[i+1]); i++; } return n; } char hex_char_to_int(const char n){ if (n>96 && n<103) return n-97+10; else if (n>64 && n<71) return n-65+10; else if (n>47 && n<58) return n-48; else return 48; } ///////////// static ERL_NIF_TERM binary_to_hex(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; size_t ret_size; ErlNifBinary ret; enif_inspect_iolist_as_binary(env, argv[0], &block); if(block.size==0) return enif_make_binary(env, &block); ret_size = block.size * 2; if ( !enif_alloc_binary(ret_size, &ret) ) return enif_make_atom(env, "error"); binary_to_hex1(block.data,ret.data,block.size); return enif_make_binary(env, &ret); } void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len){ unsigned int i,n; unsigned char x; //fprintf(stderr,"Len: %i --",(int)len); for(i=n=0;i<len;){ x=bin[i++]; // fprintf(stderr,"{%i,%i}",x>>4,x & 15); dest[n++]=int_to_hex_char(x >> 4); dest[n++]=int_to_hex_char(x & 15); } } char int_to_hex_char(const char n){ if (n<10) return n+'0'; else if (n>9) return n+'A'-10; else return '0'; } static ERL_NIF_TERM binary_to_lower_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; unsigned int i; unsigned char x; enif_inspect_iolist_as_binary(env, argv[0], &block); for (i=0;i<block.size;i++){ x=block.data[i]; if (x>'A'-1 && x<'Z'+1) block.data[i]=x+32; } return enif_make_binary(env, &block); } static ERL_NIF_TERM binary_to_upper_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; unsigned int i; unsigned char x; enif_inspect_iolist_as_binary(env, argv[0], &block); for (i=0;i<block.size;i++){ x=block.data[i]; if (x>'a'-1 && x<'z'+1) block.data[i]=x-32; } return enif_make_binary(env, &block); } /////// /* static ERL_NIF_TERM binary_string_to_integer(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) */ /* { */ /* ErlNifBinary block; */ /* unsigned total; */ /* unsigned char *p, *stop; */ /* enif_inspect_binary(env, argv[0], &block); */ /* p=&block.data[0]; */ /* for (total=0,stop=p+block.size;p<stop;p++){ */ /* if (isdigit(*p)){ */ /* total*=10; */ /* total+=*p-'0'; */ /* } */ /* } */ /* return enif_make_int(env, total); */ /* } */ static ERL_NIF_TERM url_encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block, ret; unsigned int i; unsigned char *p, *stop; enif_inspect_binary(env, argv[0], &block); if (block.size==0) return enif_make_binary(env, &block); if ( !enif_alloc_binary(block.size*3, &ret) ) return enif_make_atom(env, "error"); p=&block.data[0]; for (i=0,stop=p+block.size;p<stop;p++){ if ((*p > '0'-1 && *p < '9'+1) || (*p > 'a'-1 && *p < 'z'+1) || (*p > 'A'-1 && *p < 'Z'+1) || *p=='-' || *p=='_' || *p=='.' || *p=='~'){ ret.data[i++]=*p; } else{ ret.data[i++]='%'; ret.data[i++]=int_to_hex_char(*p >> 4); ret.data[i++]=int_to_hex_char(*p & 15); } } if (!enif_realloc_binary(&ret, i) ) return enif_make_atom(env, "error"); return enif_make_binary(env, &ret); } static ERL_NIF_TERM binary_trim(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; unsigned int start=0, end; enif_inspect_iolist_as_binary(env, argv[0], &block); if (block.size==0) return enif_make_binary(env, &block); // int enif_is_list(ErlNifEnv* env, ERL_NIF_TERM term) // ERL_NIF_TERM enif_make_binary(ErlNifEnv* env, ErlNifBinary* bin) // int enif_alloc_binary(size_t size, ErlNifBinary* bin) // int enif_inspect_iolist_as_binary(ErlNifEnv* env, ERL_NIF_TERM term, ErlNifBinary* bin) // ERL_NIF_TERM enif_make_sub_binary(ErlNifEnv* env, ERL_NIF_TERM bin_term, size_t pos, size_t size) end=block.size-1; //fprintf(stderr,"start %i end %i,",start,end); while (isspace(block.data[start])){ start++; // scanned all the way to the end if (start>end){ if (!enif_realloc_binary(&block, 0)) return enif_make_atom(env, "error"); return enif_make_binary(env, &block); } } while (isspace(block.data[end])>0 && end>0) end--; end=block.size-1-end; if (start==0 && end==0) return enif_make_binary(env, &block); return enif_make_sub_binary(env, enif_make_binary(env,&block), start, block.size-start-end); } //url decode static ERL_NIF_TERM url_decode_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; unsigned int new_size; enif_inspect_iolist_as_binary(env, argv[0], &block); new_size=url_decode_unsafe1(block.data,block.size); if (!enif_realloc_binary(&block, new_size) ) return enif_make_atom(env, "error"); return enif_make_binary(env, &block); } int url_decode_unsafe1(unsigned char *str, int len) { unsigned char *dest = str; unsigned char *data = str; while (len--) { if (*data == '+') *dest = ' '; else if (*data == '%' && len >= 2 && isdigit((int) *(data + 1)) && isdigit((int) *(data + 2))) { *dest = (unsigned char) htoi(data + 1); data += 2; len -= 2; } else *dest = *data; data++; dest++; } return dest - str; } static unsigned int htoi(unsigned char *s) { //hex to integer unsigned int value; int c; c = ((unsigned char *)s)[0]; if (isupper(c)) c = tolower(c); value = (c > '0'-1 && c < '9'+1 ? c - '0' : c - 'a' + 10) * 16; c = ((unsigned char *)s)[1]; if (isupper(c)) c = tolower(c); value += c > '0'-1 && c < '9'+1 ? c - '0' : c - 'a' + 10; return (value); } // end url decode static ErlNifFunc nif_funcs[] = { {"h2b_unsafe", 1, hex_to_binary_unsafe} ,{"h2b", 1, hex_to_binary} ,{"b2h", 1, binary_to_hex} ,{"trim", 1, binary_trim} ,{"reverse_bin_unsafe", 1, reverse_binary_unsafe1} ,{"lower_unsafe", 1, binary_to_lower_unsafe} ,{"upper_unsafe", 1, binary_to_upper_unsafe} ,{"url_encode", 1, url_encode} ,{"url_decode_unsafe", 1, url_decode_unsafe} ,{"parse_url", 1, parse_url} //,{"bs2i", 1, binary_string_to_integer} }; ERL_NIF_INIT(crawler_utils, nif_funcs, load, reload, upgrade, unload) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,22 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> //memchr #include <stdint.h> //uint32_t #include <stdbool.h> #include "erl_nif.h" struct domain { ErlNifBinary protocol; ErlNifBinary domain; unsigned int port; //ErlNifBinary path; size_t path; bool success; }; void get_protocol(uint8_t* s, struct domain* out, uint8_t* len); inline void get_domain(uint8_t* s, struct domain* out, uint8_t* len); inline void get_port(uint8_t* s, struct domain* out, uint8_t* len); inline void get_path(uint8_t* s, struct domain* out, uint8_t* len); inline void lower(uint8_t* s,uint8_t* end); ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]); This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,123 @@ #include "parse_url.h" #include "string.h" int main(void){ //uint8_t header[]="HTTP/1.1 200 OK\r\nFoo: bar\r\nCookies: Yes Please\r\n\r\n"; uint8_t url[]="http://www.google.com:80/foobar"; struct domain out = {.port=80}; get_protocol(url,&out,url+strlen((char*)url)-1); //printf("%s,%s,%d,%s,%d",out.protocol,out.domain,out.port,out.path,out.success); return 0; } ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) { ErlNifBinary block; ERL_NIF_TERM path, output; struct domain out = {.port=80,.success=false}; if (enif_is_binary(env,argv[0])==false) return enif_make_atom(env, "error"); enif_inspect_binary(env, argv[0], &block); if (block.size<4) return enif_make_atom(env, "error"); get_protocol(block.data,&out,block.data+block.size-1); if (out.success==false) return enif_make_atom(env, "error"); //fprintf(stderr,"%d %d -- //",(int)(block.size-out.path-1),(int)out.path); if (out.path==0){ unsigned char dat[1]={'/'}; ErlNifBinary bin={.size=1,.data=dat}; path=enif_make_binary(env,&bin); } else{ path=enif_make_sub_binary(env, argv[0], block.size-out.path , out.path); } output=enif_make_tuple4(env , enif_make_binary(env, &out.protocol) , enif_make_binary(env, &out.domain) , enif_make_int(env,out.port) , path ); free(out.protocol.data); free(out.domain.data); return output; } inline void get_protocol(uint8_t* s, struct domain* out, uint8_t* len){ uint8_t* i, *protocol; if ((i=memchr(s,':',len-s+1)) != NULL){ protocol = malloc(i-s); memcpy(protocol,s,i-s); //strncpy((char*)protocol,(char *)s,i-s); lower(protocol,protocol+(i-s)+1); out->protocol.size=i-s; out->protocol.data=protocol; // printf("%s\n",out->protocol); return get_domain(i+3,out,len); } else{ protocol=malloc(4); strncpy((char*)protocol,"http",4); out->protocol.size=4; out->protocol.data=protocol; return get_domain(s,out,len); } } inline void get_domain(uint8_t* s, struct domain* out, uint8_t* len){ uint8_t* i=s, *domain; while(*i != ':' && *i !='/' && i<len+1) i++; if (i==s) return; //badarg domain = malloc(i-s); memcpy(domain,s,i-s); // strncpy((char*)domain,(char *)s,i-s); lower(domain,domain+(i-s)+1); out->domain.size=i-s; out->domain.data=domain; // printf("%s\n",out->domain); if (*i == '/'){ return get_path(i,out,len); } else if(*i==':'){ return get_port(i+1,out,len); } else if(i>len-1){ out->success=true; return; } else return; //badarg } inline void get_port(uint8_t* s, struct domain* out, uint8_t* len){ uint8_t* i=s, *port; i=memchr(s,'/',len-s+1); if (i==s || i-s>5) return; //badarg if (i==NULL){ out->success=true; return; } else { port=malloc(6); strncpy((char*)port,(char *)s,i-s); port[5]='\0'; out->port=atoi((char*)port); free(port); return get_path(i,out,len); } } inline void get_path(uint8_t* s, struct domain* out, uint8_t* len){ out->path=len-s+1; /* uint8_t *path; */ /* out->success=true; */ /* if (len-s==-1) return; */ /* path = malloc(len-s+1); */ /* strncpy((char*)path,(char *)s,len-s+1); */ out->success=true; /* out->path.size=len-s+1; */ /* out->path.data=path; */ // printf("%s\n",out->path); } inline void lower(uint8_t* s,uint8_t* end){ while(s++<end) if (*s>'A'-1 && *s<'Z'+1) *s+=32; }