Skip to content

Instantly share code, notes, and snippets.

@jimmyrcom
Created November 7, 2011 17:06
Show Gist options
  • Save jimmyrcom/1345530 to your computer and use it in GitHub Desktop.
Save jimmyrcom/1345530 to your computer and use it in GitHub Desktop.

Revisions

  1. jimmyrcom created this gist Nov 7, 2011.
    349 changes: 349 additions & 0 deletions crawler_utils.erl.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,349 @@
    /*
    %% By 2011 Jimmy Ruska (JimmyR.com),
    % Licensed under the Apache License, Version 2.0 (the "License"); you may not
    % use this file except in compliance with the License. You may obtain a copy of
    % the License at
    %
    % http://www.apache.org/licenses/LICENSE-2.0
    %
    % Unless required by applicable law or agreed to in writing, software
    % distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
    % WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
    % License for the specific language governing permissions and limitations under
    % the License.
    alias nif="gcc -O3 -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c -I /usr/local/lib/erlang/erts-5.8.4/include"
    nif && cp crawler_utils.erl.so ~/e/crawler/c_src/
    cd ~/c/; gcc -O3 -fno-optimize-sibling-calls -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c parse_url.c -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -lglib-2.0 -I /usr/local/lib/erlang/erts-5.8.4/include && cp ~/c/crawler_utils.erl.c ~/e/crawler/c_src/ && cp ~/c/parse_url.c ~/e/crawler/c_src/ && cp ~/c/parse_url.h ~/e/crawler/c_src/
    TODO: maybe add a function to capitalize the first character.
    The functions htoi and url_decode are based on php source under this license.
    +----------------------------------------------------------------------+
    | PHP Version 5 |
    +----------------------------------------------------------------------+
    | Copyright (c) 1997-2011 The PHP Group |
    +----------------------------------------------------------------------+
    | This source file is subject to version 3.01 of the PHP license, |
    | that is bundled with this package in the file LICENSE, and is |
    | available through the world-wide-web at the following url: |
    | http://www.php.net/license/3_01.txt |
    | If you did not receive a copy of the PHP license and are unable to |
    | obtain it through the world-wide-web, please send a note to |
    | [email protected] so we can mail you a copy immediately. |
    +----------------------------------------------------------------------+
    | Author: Jim Winstead <[email protected]> |
    +----------------------------------------------------------------------+
    */

    #include "erl_nif.h"
    #include "stdio.h"
    #include <ctype.h>
    #include "parse_url.h"
    //#include <string.h>
    /* Helper prototypes; the definitions appear further down in this file. */
    /* Swaps bytes of bin pairwise from both ends; len is used as the index of the last byte. */
    void reverse_binary_unsafe(unsigned char* bin, size_t len);
    /* Writes two hex characters to dest for each of the len bytes read from bin. */
    void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len);
    /* Converts hex character pairs in `from` to bytes in `to`; returns the number of bytes written. */
    int hex_to_binary1(unsigned char* from,unsigned char* to, size_t len);
    /* Numeric value of one hex digit character; 48 is returned for any other character. */
    char hex_char_to_int(const char n);
    /* Hex digit character for a small integer value (digits, then 'A' upwards). */
    char int_to_hex_char(const char n);
    /* Combines the two hex digit characters at s[0] and s[1] into one value. */
    static unsigned int htoi(unsigned char *s);
    /* Rewrites '+' and recognised %-escapes in place; returns the resulting length. */
    int url_decode_unsafe1(unsigned char *str, int len);

    /* NIF library lifecycle callbacks; they are passed to ERL_NIF_INIT further down. */
    static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info);
    static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info);
    static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info);
    static void unload(ErlNifEnv* env, void* priv);

    /* Load callback: this library keeps no private state, so it only reports success (0). */
    static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
    {
        (void)env;
        (void)priv;
        (void)load_info;
        return 0;
    }

    /* Reload callback: nothing to re-initialise, always reports success (0). */
    static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
    {
        (void)env;
        (void)priv;
        (void)load_info;
        return 0;
    }

    /* Upgrade callback: no private data is carried over, always reports success (0). */
    static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info)
    {
        (void)env;
        (void)priv;
        (void)old_priv;
        (void)load_info;
        return 0;
    }

    /* Unload callback: there is no private state to release. */
    static void unload(ErlNifEnv* env, void* priv)
    {
        (void)env;
        (void)priv;
    }

    /*
     * NIF reverse_bin_unsafe/1: reverses the bytes of its argument.
     *
     * argv[0] must be an iolist or binary; any other term raises badarg.
     * The bytes are swapped directly in the buffer handed back by
     * enif_inspect_iolist_as_binary (hence "unsafe"), and that buffer is
     * returned as the result term.
     */
    static ERL_NIF_TERM reverse_binary_unsafe1(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_badarg(env);

        /* reverse_binary_unsafe expects the index of the last byte. For an
           empty input size-1 would wrap around to a huge value, and inputs of
           zero or one byte are already their own reverse, so skip the call. */
        if (block.size > 1)
            reverse_binary_unsafe(block.data, block.size - 1);

        return enif_make_binary(env, &block);
    }
    /*
     * Reverses a byte buffer in place.
     *
     * bin: buffer to reverse.
     * len: index of the LAST byte (element count minus one), not the count.
     *
     * Two cursors walk towards each other from both ends, exchanging bytes
     * until they meet.
     */
    void reverse_binary_unsafe(unsigned char *bin, size_t len){
        unsigned int lo = 0;
        size_t hi = len;

        while (lo < hi) {
            unsigned char tmp = bin[lo];
            bin[lo] = bin[hi];
            bin[hi] = tmp;
            lo++;
            hi--;
        }
    }
    /*
     * NIF h2b/1: converts a hex string (iolist or binary) to a new binary.
     *
     * Returns the decoded binary, or the atom `error` when the argument is
     * not an iolist, has an odd number of bytes, or memory cannot be
     * obtained. An empty input is returned unchanged.
     */
    static ERL_NIF_TERM hex_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        ErlNifBinary ret;
        size_t new_size;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");
        if (block.size % 2) return enif_make_atom(env, "error");
        if (block.size == 0) return enif_make_binary(env, &block);

        if (!enif_alloc_binary(block.size / 2, &ret))
            return enif_make_atom(env, "error");

        /* The decoder skips whitespace, so it may produce fewer than size/2
           bytes; shrink the result to what was actually written. */
        new_size = hex_to_binary1(block.data, ret.data, block.size);
        if (!enif_realloc_binary(&ret, new_size)) {
            /* `ret` has not been turned into a term yet, so release it here. */
            enif_release_binary(&ret);
            return enif_make_atom(env, "error");
        }
        return enif_make_binary(env, &ret);
    }

    /*
     * NIF h2b_unsafe/1: like h2b/1 but decodes inside the inspected buffer
     * instead of allocating a separate result (hence "unsafe").
     *
     * Returns the decoded binary, or the atom `error` when the argument is
     * not an iolist, has an odd number of bytes, or resizing fails. An empty
     * input is returned unchanged.
     */
    static ERL_NIF_TERM hex_to_binary_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        size_t new_size;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");
        if (block.size == 0) return enif_make_binary(env, &block);
        if (block.size % 2) return enif_make_atom(env, "error");

        /* Source and destination are the same buffer: each output byte is
           written at or before the pair of input characters it came from. */
        new_size = hex_to_binary1(block.data, block.data, block.size);
        if (!enif_realloc_binary(&block, new_size))
            return enif_make_atom(env, "error");
        return enif_make_binary(env, &block);
    }
    /*
     * Decodes pairs of hex characters into bytes.
     *
     * from: input characters (len of them).
     * to:   output buffer; may be the same buffer as `from`.
     * len:  number of input characters.
     * Returns the number of bytes written to `to`.
     *
     * A tab, space, CR or LF found where a pair would start is skipped. A
     * trailing unpaired character is ignored.
     */
    int hex_to_binary1(unsigned char* from, unsigned char* to, size_t len){
        size_t i;
        int n = 0;

        /* `i + 1 < len` rather than `i < len - 1`: the latter wraps around
           when len is 0 and would walk far outside the buffers. */
        for (i = 0; i + 1 < len; i++) {
            switch (from[i]) {
            case '\t':
            case ' ':
            case '\r':
            case '\n':
                continue;   /* advances the loop by one character */
            default:
                break;
            }
            to[n++] = hex_char_to_int(from[i]) * 16 + hex_char_to_int(from[i + 1]);
            i++;            /* the pair consumed two characters */
        }
        return n;
    }
    /*
     * Value of a single hex digit character.
     *
     * 'a'-'f' and 'A'-'F' give 10-15, '0'-'9' give 0-9.
     * Any other character gives 48.
     */
    char hex_char_to_int(const char n){
        if (n >= 'a' && n <= 'f') return n - 'a' + 10;
        if (n >= 'A' && n <= 'F') return n - 'A' + 10;
        if (n >= '0' && n <= '9') return n - '0';
        return 48;
    }

    /////////////
    /*
     * NIF b2h/1: returns the hex representation of an iolist or binary.
     *
     * The result is twice as long as the input. Returns the atom `error`
     * when the argument is not an iolist or the result cannot be allocated.
     * An empty input is returned unchanged.
     */
    static ERL_NIF_TERM binary_to_hex(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        ErlNifBinary ret;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");
        if (block.size == 0) return enif_make_binary(env, &block);

        if (!enif_alloc_binary(block.size * 2, &ret))
            return enif_make_atom(env, "error");
        binary_to_hex1(block.data, ret.data, block.size);
        return enif_make_binary(env, &ret);
    }
    /*
     * Writes the hex form of `len` bytes from `bin` into `dest`.
     *
     * Every input byte yields two characters: high nibble first, then the
     * low nibble. `dest` must have room for 2*len characters; no terminator
     * is written.
     */
    void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len){
        unsigned int in_pos = 0;
        unsigned int out_pos = 0;

        while (in_pos < len) {
            const unsigned char byte = bin[in_pos];

            dest[out_pos] = int_to_hex_char(byte >> 4);
            dest[out_pos + 1] = int_to_hex_char(byte & 15);
            in_pos += 1;
            out_pos += 2;
        }
    }
    /*
     * Hex digit character for a nibble value.
     *
     * 0-9 map to '0'-'9' and 10-15 map to 'A'-'F'. The callers in this file
     * only pass values in 0..15.
     */
    char int_to_hex_char(const char n){
        if (n < 10) return n + '0';
        /* Everything else is >= 10; the former third branch could never run. */
        return n + 'A' - 10;
    }

    /*
     * NIF lower_unsafe/1: converts ASCII 'A'-'Z' to lower case.
     *
     * argv[0] must be an iolist or binary; any other term raises badarg.
     * Bytes are rewritten directly in the inspected buffer (hence "unsafe");
     * all other byte values are left untouched.
     */
    static ERL_NIF_TERM binary_to_lower_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        size_t i;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_badarg(env);

        for (i = 0; i < block.size; i++) {
            const unsigned char x = block.data[i];
            if (x >= 'A' && x <= 'Z') block.data[i] = x + 32;
        }
        return enif_make_binary(env, &block);
    }
    /*
     * NIF upper_unsafe/1: converts ASCII 'a'-'z' to upper case.
     *
     * argv[0] must be an iolist or binary; any other term raises badarg.
     * Bytes are rewritten directly in the inspected buffer (hence "unsafe");
     * all other byte values are left untouched.
     */
    static ERL_NIF_TERM binary_to_upper_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        size_t i;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_badarg(env);

        for (i = 0; i < block.size; i++) {
            const unsigned char x = block.data[i];
            if (x >= 'a' && x <= 'z') block.data[i] = x - 32;
        }
        return enif_make_binary(env, &block);
    }

    ///////
    /* static ERL_NIF_TERM binary_string_to_integer(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) */
    /* { */
    /* ErlNifBinary block; */
    /* unsigned total; */
    /* unsigned char *p, *stop; */

    /* enif_inspect_binary(env, argv[0], &block); */
    /* p=&block.data[0]; */
    /* for (total=0,stop=p+block.size;p<stop;p++){ */
    /* if (isdigit(*p)){ */
    /* total*=10; */
    /* total+=*p-'0'; */
    /* } */
    /* } */
    /* return enif_make_int(env, total); */
    /* } */
    /*
     * NIF url_encode/1: percent-encodes a binary.
     *
     * ASCII letters, digits and '-', '_', '.', '~' are copied as they are;
     * every other byte becomes '%' followed by two upper-case hex digits.
     * Returns the encoded binary, or the atom `error` when the argument is
     * not a binary or memory cannot be obtained. An empty input is returned
     * unchanged.
     */
    static ERL_NIF_TERM url_encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block, ret;
        size_t i = 0;
        const unsigned char *p, *stop;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");
        if (block.size == 0) return enif_make_binary(env, &block);

        /* Worst case every byte expands to three; refuse sizes whose triple
           would not fit in size_t. */
        if (block.size > ((size_t)-1) / 3)
            return enif_make_atom(env, "error");
        if (!enif_alloc_binary(block.size * 3, &ret))
            return enif_make_atom(env, "error");

        for (p = block.data, stop = p + block.size; p < stop; p++) {
            const unsigned char c = *p;

            if ((c >= '0' && c <= '9')
                || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || c == '-' || c == '_' || c == '.' || c == '~') {
                ret.data[i++] = c;
            } else {
                ret.data[i++] = '%';
                ret.data[i++] = int_to_hex_char(c >> 4);
                ret.data[i++] = int_to_hex_char(c & 15);
            }
        }

        /* Shrink to the bytes actually produced. */
        if (!enif_realloc_binary(&ret, i)) {
            /* `ret` has not been turned into a term yet, so release it here. */
            enif_release_binary(&ret);
            return enif_make_atom(env, "error");
        }
        return enif_make_binary(env, &ret);
    }
    /*
     * NIF trim/1: strips leading and trailing whitespace (as classified by
     * isspace) from an iolist or binary.
     *
     * Returns the input term itself when there is nothing to strip, an empty
     * binary when the input is all whitespace, and otherwise a sub-binary of
     * the input. Returns the atom `error` when the argument is not an iolist
     * or resizing fails.
     */
    static ERL_NIF_TERM binary_trim(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        ERL_NIF_TERM whole;
        size_t start = 0;
        size_t end;     /* one past the last byte that is kept */

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");
        if (block.size == 0) return enif_make_binary(env, &block);

        while (start < block.size && isspace(block.data[start])) start++;

        if (start == block.size) {
            /* Nothing but whitespace: the result is empty. */
            if (!enif_realloc_binary(&block, 0))
                return enif_make_atom(env, "error");
            return enif_make_binary(env, &block);
        }

        /* data[start] is not whitespace, so this loop stops at start + 1 at
           the latest. isspace only promises a non-zero result for a match,
           so the value is tested for truth rather than compared with 0. */
        end = block.size;
        while (end > start && isspace(block.data[end - 1])) end--;

        if (start == 0 && end == block.size) return enif_make_binary(env, &block);

        whole = enif_make_binary(env, &block);
        return enif_make_sub_binary(env, whole, start, end - start);
    }

    //url decode
    /*
     * NIF url_decode_unsafe/1: URL-decodes an iolist or binary.
     *
     * Decoding happens inside the inspected buffer (hence "unsafe"), which
     * is then shrunk to the decoded length. Returns the decoded binary, or
     * the atom `error` when the argument is not an iolist or resizing fails.
     */
    static ERL_NIF_TERM url_decode_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        unsigned int new_size;

        /* When inspection fails `block` is not filled in and must not be read. */
        if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
            return enif_make_atom(env, "error");

        new_size = url_decode_unsafe1(block.data, block.size);
        if (!enif_realloc_binary(&block, new_size))
            return enif_make_atom(env, "error");
        return enif_make_binary(env, &block);
    }
    /*
     * Converts the two hex digit characters at s[0] and s[1] to one value.
     * Upper- and lower-case digits are accepted; the caller must have
     * checked that both characters are hex digits.
     */
    static unsigned int htoi(unsigned char *s)
    {
        unsigned int value;
        int c;

        c = s[0];
        if (isupper(c)) c = tolower(c);
        value = (c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10) * 16;

        c = s[1];
        if (isupper(c)) c = tolower(c);
        value += c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10;

        return value;
    }

    /*
     * URL-decodes `len` bytes of `str` in place.
     *
     * '+' becomes a space and "%XX" (two hex digits, either case) becomes
     * the byte with that value. A '%' that is not followed by two hex digits
     * is copied unchanged. No terminator is written.
     *
     * Returns the decoded length, which is never larger than `len`.
     * A `len` of zero or less decodes nothing and returns 0.
     */
    int url_decode_unsafe1(unsigned char *str, int len)
    {
        unsigned char *dest = str;
        unsigned char *data = str;

        while (len-- > 0) {
            if (*data == '+') {
                *dest = ' ';
            }
            /* After the decrement `len` counts the bytes that follow *data,
               so len >= 2 guarantees data[1] and data[2] exist. isxdigit is
               required here: isdigit would leave escapes such as %2F or %3a
               undecoded. */
            else if (*data == '%' && len >= 2
                     && isxdigit(data[1]) && isxdigit(data[2])) {
                *dest = (unsigned char) htoi(data + 1);
                data += 2;
                len -= 2;
            }
            else {
                *dest = *data;
            }
            data++;
            dest++;
        }
        return (int)(dest - str);
    }
    // end url decode

    /* Erlang-visible name, arity and C implementation of every exported NIF. */
    static ErlNifFunc nif_funcs[] = {
        {"h2b_unsafe",         1, hex_to_binary_unsafe},
        {"h2b",                1, hex_to_binary},
        {"b2h",                1, binary_to_hex},
        {"trim",               1, binary_trim},
        {"reverse_bin_unsafe", 1, reverse_binary_unsafe1},
        {"lower_unsafe",       1, binary_to_lower_unsafe},
        {"upper_unsafe",       1, binary_to_upper_unsafe},
        {"url_encode",         1, url_encode},
        {"url_decode_unsafe",  1, url_decode_unsafe},
        {"parse_url",          1, parse_url}
        /* "bs2i" (binary_string_to_integer) is not registered. */
    };

    /* Binds the table and the lifecycle callbacks to the crawler_utils module. */
    ERL_NIF_INIT(crawler_utils, nif_funcs, load, reload, upgrade, unload)
    22 changes: 22 additions & 0 deletions parse.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,22 @@
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h> //memchr
    #include <stdint.h> //uint32_t
    #include <stdbool.h>
    #include "erl_nif.h"

    /* Result of parsing one URL; filled in step by step by get_protocol,
       get_domain, get_port and get_path. */
    struct domain {
    /* Scheme; data/size are set by get_protocol from a malloc'd buffer. */
    ErlNifBinary protocol;
    /* Host name; data/size are set by get_domain from a malloc'd buffer. */
    ErlNifBinary domain;
    /* Port number; callers initialise it to 80 and get_port may overwrite it. */
    unsigned int port;
    /* The path is not copied; only its length is recorded (see next field). */
    //ErlNifBinary path;
    /* Number of bytes from the start of the path to the end of the input,
       as computed by get_path; parse_url treats 0 as "no path" and uses "/". */
    size_t path;
    /* Set to true by the parser once it has completed; false means bad input. */
    bool success;
    };

    /*
     * URL parser steps. In every function `s` points at the first byte still
     * to be parsed and `len` points at the LAST byte of the input (it is a
     * pointer, not a count). Each step records what it found in `out` and
     * hands over to the next one.
     *
     * These prototypes deliberately carry no `inline`: in C99 and later, a
     * function whose file-scope declarations are all plain `inline` has only
     * an inline definition and no external symbol, so any call the compiler
     * chooses not to inline is left unresolved. A declaration without
     * `inline` makes the definition an ordinary external one.
     */
    void get_protocol(uint8_t* s, struct domain* out, uint8_t* len);
    void get_domain(uint8_t* s, struct domain* out, uint8_t* len);
    void get_port(uint8_t* s, struct domain* out, uint8_t* len);
    void get_path(uint8_t* s, struct domain* out, uint8_t* len);
    /* Lower-cases ASCII 'A'-'Z' in a byte range. */
    void lower(uint8_t* s,uint8_t* end);
    /* NIF parse_url/1: returns {Protocol, Domain, Port, Path} or the atom error. */
    ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
    123 changes: 123 additions & 0 deletions parse_url.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,123 @@
    #include "parse_url.h"
    #include "string.h"

    /*
     * Stand-alone smoke test: runs the parser once over a fixed URL.
     * Always returns 0.
     */
    int main(void){
        uint8_t url[]="http://www.google.com:80/foobar";
        struct domain out = {.port=80};

        /* The third argument is a pointer to the last byte of the input. */
        get_protocol(url,&out,url+strlen((char*)url)-1);

        /* The parser allocates these buffers with malloc. Members it never
           reached are still NULL from the initializer, which free accepts. */
        free(out.protocol.data);
        free(out.domain.data);
        return 0;
    }

    /* Copies `size` bytes from `src` into a fresh binary term owned by `env`. */
    static ERL_NIF_TERM parse_url_copy_binary(ErlNifEnv* env, const unsigned char* src, size_t size)
    {
        ERL_NIF_TERM term;
        unsigned char* dst = enif_make_new_binary(env, size, &term);

        if (size > 0) memcpy(dst, src, size);
        return term;
    }

    /*
     * NIF parse_url/1: splits a URL binary into its parts.
     *
     * Returns {Protocol, Domain, Port, Path}: Protocol and Domain are
     * binaries, Port is an integer (80 unless the parser sets another) and
     * Path is a sub-binary of the argument, or <<"/">> when the URL has no
     * path. Returns the atom `error` when the argument is not a binary, is
     * shorter than 4 bytes, or cannot be parsed.
     */
    ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
    {
        ErlNifBinary block;
        ERL_NIF_TERM protocol, domain, path, output;
        struct domain out = {.port=80,.success=false};

        /* Fails for anything that is not a binary. */
        if (!enif_inspect_binary(env, argv[0], &block)) return enif_make_atom(env, "error");
        if (block.size<4) return enif_make_atom(env, "error");

        /* The parser takes a pointer to the last byte, not a length. */
        get_protocol(block.data,&out,block.data+block.size-1);
        if (out.success==false) {
            /* The parser may already have allocated one or both buffers
               before giving up; members it never set are still NULL. */
            free(out.protocol.data);
            free(out.domain.data);
            return enif_make_atom(env, "error");
        }

        if (out.path==0){
            static const unsigned char root[1]={'/'};
            path=parse_url_copy_binary(env, root, sizeof root);
        }
        else{
            /* out.path counts the bytes from the path start to the end. */
            path=enif_make_sub_binary(env, argv[0], block.size-out.path , out.path);
        }

        /* enif_make_binary takes ownership of the ErlNifBinary it is given,
           so it must not be handed malloc'd or stack memory that is released
           here. Copy the parser's buffers into term-owned binaries instead. */
        protocol=parse_url_copy_binary(env, out.protocol.data, out.protocol.size);
        domain=parse_url_copy_binary(env, out.domain.data, out.domain.size);

        output=enif_make_tuple4(env
                                , protocol
                                , domain
                                , enif_make_int(env,out.port)
                                , path
                                );
        free(out.protocol.data);
        free(out.domain.data);
        return output;
    }



    /*
     * First parser step: extracts the scheme and continues with the host.
     *
     * s:   first byte of the URL.
     * out: result record; protocol.data receives a malloc'd, lower-cased
     *      copy of the scheme that the caller must free.
     * len: pointer to the LAST byte of the URL (not a count).
     *
     * When the input contains a ':' the bytes before the first one are the
     * scheme and the three bytes starting at that ':' are skipped. When
     * there is no ':' at all the scheme defaults to "http". On bad input or
     * allocation failure the function returns with out->success unchanged.
     */
    inline void get_protocol(uint8_t* s, struct domain* out, uint8_t* len){
        uint8_t *colon, *protocol;
        size_t n, k;

        colon = memchr(s, ':', len-s+1);
        if (colon != NULL) {
            /* Three bytes are skipped below, so at least one byte must
               remain after them; otherwise parsing would start outside the
               input. NOTE(review): the two bytes after ':' are skipped
               without checking that they are "//". */
            if (len - colon < 3) return; //badarg

            n = colon - s;
            protocol = malloc(n);
            if (protocol == NULL && n > 0) return; //out of memory

            /* Lower-case while copying, staying inside the n-byte buffer. */
            for (k = 0; k < n; k++) {
                uint8_t c = s[k];
                if (c >= 'A' && c <= 'Z') c += 32;
                protocol[k] = c;
            }
            out->protocol.size = n;
            out->protocol.data = protocol;
            get_domain(colon + 3, out, len);
        } else {
            protocol = malloc(4);
            if (protocol == NULL) return; //out of memory
            memcpy(protocol, "http", 4);
            out->protocol.size = 4;
            out->protocol.data = protocol;
            get_domain(s, out, len);
        }
    }

    /*
     * Second parser step: extracts the host name.
     *
     * s:   first byte of the host.
     * out: result record; domain.data receives a malloc'd, lower-cased copy
     *      of the host that the caller must free.
     * len: pointer to the LAST byte of the URL (not a count).
     *
     * The host runs up to the first ':' or '/' or to the end of the input.
     * A ':' continues with get_port, a '/' with get_path, and the end of the
     * input completes the parse. An empty host or an allocation failure
     * returns with out->success unchanged.
     *
     * Defined without `inline` so that an external definition always exists.
     */
    void get_domain(uint8_t* s, struct domain* out, uint8_t* len){
        uint8_t *i = s, *domain;
        size_t n, k;

        /* Bounds first: *i must not be read once i has passed the last byte. */
        while (i <= len && *i != ':' && *i != '/') i++;

        if (i == s) return; //badarg

        n = i - s;
        domain = malloc(n);
        if (domain == NULL) return; //out of memory

        /* Lower-case while copying, staying inside the n-byte buffer. */
        for (k = 0; k < n; k++) {
            uint8_t c = s[k];
            if (c >= 'A' && c <= 'Z') c += 32;
            domain[k] = c;
        }
        out->domain.size = n;
        out->domain.data = domain;

        if (i > len) {
            /* The host ran to the end of the input: no port, no path. */
            out->success = true;
            return;
        }
        if (*i == '/') {
            get_path(i, out, len);
        } else {
            /* The loop only stops inside the input on ':' or '/'. */
            get_port(i + 1, out, len);
        }
    }


    /*
     * Third parser step: reads the port that follows "host:".
     *
     * s:   first byte after the ':'.
     * out: result record; out->port is overwritten when a port is present.
     * len: pointer to the LAST byte of the URL (not a count).
     *
     * The port is the run of bytes up to the next '/' or to the end of the
     * input. It must be 1 to 5 decimal digits with a value of at most 65535.
     * A '/' continues with get_path; otherwise the parse is complete.
     * An input that ends right after the ':' keeps the port already stored
     * in `out` and succeeds. ":/" and any malformed port return with
     * out->success unchanged.
     *
     * Defined without `inline` so that an external definition always exists.
     */
    void get_port(uint8_t* s, struct domain* out, uint8_t* len){
        uint8_t *slash = NULL, *stop, *p;
        unsigned int port = 0;
        size_t ndigits;

        if (s <= len) slash = memchr(s, '/', len - s + 1);
        /* Without a '/' the port runs to one past the last byte. */
        stop = (slash != NULL) ? slash : len + 1;

        ndigits = stop - s;
        if (ndigits > 5) return; //badarg
        if (ndigits == 0) {
            if (slash != NULL) return; //badarg: "host:/..."
            out->success = true;       // "host:" keeps the current port
            return;
        }

        /* Parse the digits directly from the input; nothing is copied, so
           no byte outside [s, stop) is ever read. */
        for (p = s; p < stop; p++) {
            if (*p < '0' || *p > '9') return; //badarg
            port = port * 10 + (unsigned int)(*p - '0');
        }
        if (port > 65535) return; //badarg

        out->port = port;
        if (slash != NULL) {
            get_path(slash, out, len);
        } else {
            out->success = true;
        }
    }

    /*
     * Last parser step: records the path and marks the parse as successful.
     *
     * s:   first byte of the path.
     * out: result record; out->path receives the number of bytes from `s`
     *      to the end of the input. The path itself is not copied.
     * len: pointer to the LAST byte of the URL (not a count).
     *
     * Defined without `inline` so that an external definition always exists.
     */
    void get_path(uint8_t* s, struct domain* out, uint8_t* len){
        out->path = len - s + 1;
        out->success = true;
    }

    /*
     * Lower-cases ASCII 'A'-'Z' in place.
     *
     * s:   first byte to convert.
     * end: s + count + 1, i.e. one past the position that directly follows
     *      the last byte to convert. Exactly the `count` bytes s[0] ..
     *      s[count-1] are examined; nothing at or beyond s[count] is read or
     *      written.
     *
     * Defined without `inline` so that an external definition always exists.
     */
    void lower(uint8_t* s,uint8_t* end){
        uint8_t *stop;

        if (s == NULL || end == NULL || end - s < 2) return; /* nothing to convert */

        stop = end - 1;
        for (; s < stop; s++) {
            if (*s >= 'A' && *s <= 'Z') *s += 32;
        }
    }