Skip to content

Instantly share code, notes, and snippets.

@jimmyrcom
Created November 7, 2011 17:06
Show Gist options
  • Save jimmyrcom/1345530 to your computer and use it in GitHub Desktop.
Save jimmyrcom/1345530 to your computer and use it in GitHub Desktop.
my nifs
/*
%% By 2011 Jimmy Ruska (JimmyR.com),
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.
alias nif="gcc -O3 -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c -I /usr/local/lib/erlang/erts-5.8.4/include"
nif && cp crawler_utils.erl.so ~/e/crawler/c_src/
cd ~/c/; gcc -O3 -fno-optimize-sibling-calls -Wall -fPIC -shared -o crawler_utils.erl.so crawler_utils.erl.c parse_url.c -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -lglib-2.0 -I /usr/local/lib/erlang/erts-5.8.4/include && cp ~/c/crawler_utils.erl.c ~/e/crawler/c_src/ && cp ~/c/parse_url.c ~/e/crawler/c_src/ && cp ~/c/parse_url.h ~/e/crawler/c_src/
TODO: maybe add a function to capitalize the first character.
The functions htoi and url_decode are based on php source under this license.
+----------------------------------------------------------------------+
| PHP Version 5 |
+----------------------------------------------------------------------+
| Copyright (c) 1997-2011 The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Author: Jim Winstead <jimw@php.net> |
+----------------------------------------------------------------------+
*/
#include "erl_nif.h"
#include "stdio.h"
#include <ctype.h>
#include "parse_url.h"
//#include <string.h>
/* Forward declarations for the byte-level helpers and the NIF lifecycle
 * callbacks defined later in this file. */
/* Reverses bytes in place; the NIF wrapper passes the index of the last byte as len. */
void reverse_binary_unsafe(unsigned char* bin, size_t len);
/* Writes two hex characters into dest for each of the len bytes in bin. */
void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len);
/* Converts hex text in from to raw bytes in to; returns the number of bytes written. */
int hex_to_binary1(unsigned char* from,unsigned char* to, size_t len);
/* Value of one hex digit character (returns 48 for a non-hex character). */
char hex_char_to_int(const char n);
/* Upper-case hex digit character for a nibble value. */
char int_to_hex_char(const char n);
/* Value of the two hex digits at s[0] and s[1]. */
static unsigned int htoi(unsigned char *s);
/* Decodes '+' and %XX escapes in place; returns the decoded length. */
int url_decode_unsafe1(unsigned char *str, int len);
/* NIF lifecycle callbacks passed to ERL_NIF_INIT. */
static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info);
static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info);
static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info);
static void unload(ErlNifEnv* env, void* priv);
/* NIF load callback: the library keeps no private state, so it just reports success. */
static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info) {
    (void)env;
    (void)priv;
    (void)load_info;
    return 0;
}
/* NIF reload callback: nothing to re-initialise, always reports success. */
static int reload(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info) {
    (void)env;
    (void)priv;
    (void)load_info;
    return 0;
}
/* NIF upgrade callback: there is no private data to migrate, always reports success. */
static int upgrade(ErlNifEnv* env, void** priv, void** old_priv, ERL_NIF_TERM load_info) {
    (void)env;
    (void)priv;
    (void)old_priv;
    (void)load_info;
    return 0;
}
/* NIF unload callback: nothing was allocated at load time, so nothing to release. */
static void unload(ErlNifEnv* env, void* priv) {
    (void)env;
    (void)priv;
}
/*
 * NIF reverse_bin_unsafe/1: reverses the bytes of an iolist/binary.
 *
 * "Unsafe" because the reversal is done directly on the buffer handed back by
 * enif_inspect_iolist_as_binary rather than on a private copy.
 *
 * Returns the reversed binary, or the atom 'error' when argv[0] is not an
 * iolist.
 */
static ERL_NIF_TERM reverse_binary_unsafe1(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");

    /* The helper takes the index of the last byte; guarding on size avoids
     * the size_t underflow of `size - 1` for an empty binary. */
    if (block.size > 1)
        reverse_binary_unsafe(block.data, block.size - 1);

    return enif_make_binary(env, &block);
}
/*
 * Reverses a byte buffer in place.
 *
 * bin: buffer to reverse.
 * len: index of the LAST byte to include (i.e. byte count minus one), not the
 *      byte count. Passing 0 leaves the buffer untouched.
 *
 * Uses size_t indices (the original `unsigned int` index was narrower than the
 * size_t bound) and an unsigned temporary so byte values above 127 are not
 * squeezed through a possibly signed char.
 */
void reverse_binary_unsafe(unsigned char *bin, size_t len){
    size_t lo = 0;
    size_t hi = len;

    while (lo < hi) {
        unsigned char tmp = bin[lo];
        bin[lo] = bin[hi];
        bin[hi] = tmp;
        lo++;
        hi--;
    }
}
/*
 * NIF h2b/1: converts hex text (iolist/binary) into a freshly allocated
 * binary of the decoded bytes.
 *
 * Returns the decoded binary; the atom 'error' when the argument is not an
 * iolist, has an odd length, or an allocation fails. An empty input is
 * returned as an empty binary.
 */
static ERL_NIF_TERM hex_to_binary(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    ErlNifBinary ret;
    size_t new_size;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");
    if (block.size % 2)
        return enif_make_atom(env, "error");
    if (block.size == 0)
        return enif_make_binary(env, &block);

    /* Every output byte consumes two input characters, so size/2 is enough. */
    if (!enif_alloc_binary(block.size / 2, &ret))
        return enif_make_atom(env, "error");

    /* Skipped whitespace can make the result shorter than size/2. */
    new_size = hex_to_binary1(block.data, ret.data, block.size);
    if (!enif_realloc_binary(&ret, new_size)) {
        /* We still own `ret`; release it instead of leaking it. */
        enif_release_binary(&ret);
        return enif_make_atom(env, "error");
    }
    return enif_make_binary(env, &ret);
}
/*
 * NIF h2b_unsafe/1: like h2b/1 but decodes in place, overwriting the front of
 * the inspected buffer with the decoded bytes and then shrinking it.
 *
 * Returns the decoded binary; the atom 'error' when the argument is not an
 * iolist, has an odd length, or the resize fails.
 */
static ERL_NIF_TERM hex_to_binary_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    size_t new_size;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");
    if (block.size == 0)
        return enif_make_binary(env, &block);
    if (block.size % 2)
        return enif_make_atom(env, "error");

    /* Source and destination alias: the write cursor never passes the read
     * cursor because each output byte consumes two input characters. */
    new_size = hex_to_binary1(block.data, block.data, block.size);
    if (!enif_realloc_binary(&block, new_size))
        return enif_make_atom(env, "error");
    return enif_make_binary(env, &block);
}
/*
 * Converts hex text to raw bytes.
 *
 * from: input characters (len of them); tab, space, CR and LF are skipped
 *       when they appear where a digit pair would start.
 * to:   output buffer; may alias `from` because the write position never
 *       overtakes the read position.
 * len:  number of input characters.
 *
 * Returns the number of bytes written to `to`. Characters that are not hex
 * digits are not rejected; hex_char_to_int decides their value.
 */
int hex_to_binary1(unsigned char* from, unsigned char* to, size_t len){
    size_t i = 0;
    int n = 0;

    /* `i + 1 < len` instead of `i < len - 1`: the latter underflows to
     * SIZE_MAX when len == 0 and would read far out of bounds. */
    while (i + 1 < len) {
        const unsigned char c = from[i];

        if (c == '\t' || c == ' ' || c == '\r' || c == '\n') {
            i++;
            continue;
        }
        to[n++] = hex_char_to_int(c) * 16 + hex_char_to_int(from[i + 1]);
        i += 2;
    }
    return n;
}
/*
 * Maps one hex digit character ('0'-'9', 'a'-'f', 'A'-'F') to its value 0-15.
 * Any other character yields 48.
 */
char hex_char_to_int(const char n){
    if (n >= '0' && n <= '9')
        return n - '0';
    if (n >= 'A' && n <= 'F')
        return n - 'A' + 10;
    if (n >= 'a' && n <= 'f')
        return n - 'a' + 10;
    return 48;
}
/////////////
/*
 * NIF b2h/1: returns a new binary holding the upper-case hex representation
 * (two characters per byte) of the given iolist/binary.
 *
 * Returns the atom 'error' when the argument is not an iolist or the output
 * cannot be allocated. An empty input is returned as an empty binary.
 */
static ERL_NIF_TERM binary_to_hex(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    ErlNifBinary ret;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");
    if (block.size == 0)
        return enif_make_binary(env, &block);

    if (!enif_alloc_binary(block.size * 2, &ret))
        return enif_make_atom(env, "error");
    binary_to_hex1(block.data, ret.data, block.size);
    return enif_make_binary(env, &ret);
}
/*
 * Hex-encodes len bytes from bin into dest, high nibble first.
 * dest must have room for 2 * len characters; no terminator is written.
 */
void binary_to_hex1(unsigned char* bin,unsigned char* dest,const size_t len){
    unsigned int src;
    unsigned int out = 0;

    for (src = 0; src < len; src++) {
        const unsigned char byte = bin[src];

        dest[out++] = int_to_hex_char(byte >> 4);
        dest[out++] = int_to_hex_char(byte & 15);
    }
}
/*
 * Converts a nibble value to its hex digit: values below 10 map onto
 * '0'..'9', everything else onto 'A' and upwards (upper case).
 */
char int_to_hex_char(const char n){
    return (n < 10) ? n + '0' : n + 'A' - 10;
}
/*
 * NIF lower_unsafe/1: converts ASCII 'A'-'Z' to lower case.
 *
 * "Unsafe" because the bytes are rewritten directly in the buffer handed back
 * by enif_inspect_iolist_as_binary rather than in a private copy.
 *
 * Returns the converted binary, or the atom 'error' when argv[0] is not an
 * iolist.
 */
static ERL_NIF_TERM binary_to_lower_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    size_t i;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");

    for (i = 0; i < block.size; i++) {
        const unsigned char x = block.data[i];

        if (x >= 'A' && x <= 'Z')
            block.data[i] = x + ('a' - 'A');
    }
    return enif_make_binary(env, &block);
}
/*
 * NIF upper_unsafe/1: converts ASCII 'a'-'z' to upper case.
 *
 * "Unsafe" because the bytes are rewritten directly in the buffer handed back
 * by enif_inspect_iolist_as_binary rather than in a private copy.
 *
 * Returns the converted binary, or the atom 'error' when argv[0] is not an
 * iolist.
 */
static ERL_NIF_TERM binary_to_upper_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    size_t i;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");

    for (i = 0; i < block.size; i++) {
        const unsigned char x = block.data[i];

        if (x >= 'a' && x <= 'z')
            block.data[i] = x - ('a' - 'A');
    }
    return enif_make_binary(env, &block);
}
///////
/* static ERL_NIF_TERM binary_string_to_integer(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) */
/* { */
/* ErlNifBinary block; */
/* unsigned total; */
/* unsigned char *p, *stop; */
/* enif_inspect_binary(env, argv[0], &block); */
/* p=&block.data[0]; */
/* for (total=0,stop=p+block.size;p<stop;p++){ */
/* if (isdigit(*p)){ */
/* total*=10; */
/* total+=*p-'0'; */
/* } */
/* } */
/* return enif_make_int(env, total); */
/* } */
/*
 * NIF url_encode/1: percent-encodes a binary.
 *
 * ASCII letters, digits and "-_.~" are copied unchanged; every other byte
 * becomes %XX with upper-case hex digits.
 *
 * Returns the encoded binary; the atom 'error' when argv[0] is not a binary
 * or an allocation fails. An empty input is returned as an empty binary.
 */
static ERL_NIF_TERM url_encode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block, ret;
    size_t i = 0;
    const unsigned char *p, *stop;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");
    if (block.size == 0)
        return enif_make_binary(env, &block);

    /* Worst case: every byte expands to three characters. */
    if (!enif_alloc_binary(block.size * 3, &ret))
        return enif_make_atom(env, "error");

    for (p = block.data, stop = p + block.size; p < stop; p++) {
        const unsigned char c = *p;
        /* Explicit ranges keep the result independent of the C locale. */
        const int unreserved = (c >= '0' && c <= '9')
            || (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || c == '-' || c == '_' || c == '.' || c == '~';

        if (unreserved) {
            ret.data[i++] = c;
        } else {
            ret.data[i++] = '%';
            ret.data[i++] = int_to_hex_char(c >> 4);
            ret.data[i++] = int_to_hex_char(c & 15);
        }
    }

    /* Shrink to the bytes actually written. */
    if (!enif_realloc_binary(&ret, i)) {
        /* We still own `ret`; release it instead of leaking it. */
        enif_release_binary(&ret);
        return enif_make_atom(env, "error");
    }
    return enif_make_binary(env, &ret);
}
/*
 * NIF trim/1: strips leading and trailing whitespace (as classified by
 * isspace) from an iolist/binary.
 *
 * Returns the input as a binary when nothing needs trimming, otherwise a
 * sub-binary covering the kept range (empty when the input is all
 * whitespace). Returns the atom 'error' when argv[0] is not an iolist.
 */
static ERL_NIF_TERM binary_trim(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    size_t start = 0;
    size_t end;              /* one past the last byte to keep */

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");
    if (block.size == 0)
        return enif_make_binary(env, &block);

    end = block.size;
    while (start < end && isspace(block.data[start]))
        start++;
    while (end > start && isspace(block.data[end - 1]))
        end--;

    if (start == 0 && end == block.size)
        return enif_make_binary(env, &block);

    /* All whitespace: return an empty slice rather than reallocating the
     * inspected (read-only) binary as the original code did. */
    if (start == end)
        start = 0, end = 0;

    return enif_make_sub_binary(env, enif_make_binary(env, &block), start, end - start);
}
//url decode
/*
 * NIF url_decode_unsafe/1: URL-decodes an iolist/binary.
 *
 * "Unsafe" because decoding happens directly in the buffer handed back by
 * enif_inspect_iolist_as_binary, which is then shrunk to the decoded length.
 *
 * Returns the decoded binary; the atom 'error' when argv[0] is not an iolist
 * or the resize fails.
 */
static ERL_NIF_TERM url_decode_unsafe(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    int new_size;

    (void)argc;
    /* Without this check `block` would be used uninitialised on bad input. */
    if (!enif_inspect_iolist_as_binary(env, argv[0], &block))
        return enif_make_atom(env, "error");

    /* Decoding never grows the data, so it can be done in place. */
    new_size = url_decode_unsafe1(block.data, (int)block.size);
    if (!enif_realloc_binary(&block, (size_t)new_size))
        return enif_make_atom(env, "error");
    return enif_make_binary(env, &block);
}
/*
 * Returns the value (0-255) of the two hex digits at s[0] and s[1].
 * The caller must already have checked that both are hex digits.
 */
static unsigned int htoi(unsigned char *s)
{
    unsigned int value = 0;
    int k;

    for (k = 0; k < 2; k++) {
        int c = s[k];

        if (c >= 'A' && c <= 'Z')
            c += 'a' - 'A';
        value = value * 16
            + (unsigned int)((c >= '0' && c <= '9') ? c - '0' : c - 'a' + 10);
    }
    return value;
}

/*
 * URL-decodes a buffer in place.
 *
 * '+' becomes a space and "%XX" (two hex digits, either case) becomes the
 * byte it encodes. A '%' that is not followed by two hex digits is copied
 * through unchanged.
 *
 * str: buffer to decode; it does not need to be NUL-terminated.
 * len: number of bytes in str; values <= 0 decode nothing.
 *
 * Returns the decoded length, which is never larger than len.
 */
int url_decode_unsafe1(unsigned char *str, int len)
{
    unsigned char *dest = str;
    unsigned char *data = str;

    /* `> 0` so that a negative length cannot spin the loop. */
    while (len-- > 0) {
        if (*data == '+') {
            *dest = ' ';
        }
        /* isxdigit, not isdigit: escapes such as %2F or %7e carry hex
         * letters and were previously left undecoded. The `len >= 2` test
         * comes first so data[1] and data[2] are only read when in bounds. */
        else if (*data == '%' && len >= 2
                 && isxdigit((int) *(data + 1))
                 && isxdigit((int) *(data + 2))) {
            *dest = (unsigned char) htoi(data + 1);
            data += 2;
            len -= 2;
        }
        else {
            *dest = *data;
        }
        data++;
        dest++;
    }
    return (int)(dest - str);
}
// end url decode
/* Table of exported NIFs: {Erlang function name, arity, C implementation}. */
static ErlNifFunc nif_funcs[] =
{
{"h2b_unsafe", 1, hex_to_binary_unsafe}
,{"h2b", 1, hex_to_binary}
,{"b2h", 1, binary_to_hex}
,{"trim", 1, binary_trim}
,{"reverse_bin_unsafe", 1, reverse_binary_unsafe1}
,{"lower_unsafe", 1, binary_to_lower_unsafe}
,{"upper_unsafe", 1, binary_to_upper_unsafe}
,{"url_encode", 1, url_encode}
,{"url_decode_unsafe", 1, url_decode_unsafe}
,{"parse_url", 1, parse_url}
// bs2i is disabled: its implementation is commented out in this file
//,{"bs2i", 1, binary_string_to_integer}
};
/* Registers the table and the lifecycle callbacks for the crawler_utils module. */
ERL_NIF_INIT(crawler_utils, nif_funcs, load, reload, upgrade, unload)
#include <stdio.h>
#include <stdlib.h>
#include <string.h> //memchr
#include <stdint.h> //uint32_t
#include <stdbool.h>
#include "erl_nif.h"
/* Result of parsing a URL; filled in step by step by the get_* functions. */
struct domain {
// scheme: size/data are set by get_protocol to a malloc'd buffer
ErlNifBinary protocol;
// host name: size/data are set by get_domain to a malloc'd buffer
ErlNifBinary domain;
// port number; callers initialise it to 80 and get_port may overwrite it
unsigned int port;
//ErlNifBinary path;
// byte count from the start of the path to the end of the input, as set by
// get_path; parse_url treats 0 as "no path" and substitutes "/"
size_t path;
// set to true by the get_* functions once parsing has succeeded
bool success;
};
/*
 * URL parsing chain: get_protocol -> get_domain -> get_port -> get_path.
 *
 * In all four, `s` points at the first byte still to be parsed and `len`
 * points at the LAST valid byte of the input buffer (callers pass
 * data + size - 1). Results are written into *out; out->success is set to
 * true only when the whole URL parsed.
 *
 * The helpers are declared without `inline`: a function whose every
 * declaration says `inline` (and none says `extern` or `static`) gets no
 * external definition under C99/C11, so calls to it can fail to link.
 */
void get_protocol(uint8_t* s, struct domain* out, uint8_t* len);
void get_domain(uint8_t* s, struct domain* out, uint8_t* len);
void get_port(uint8_t* s, struct domain* out, uint8_t* len);
void get_path(uint8_t* s, struct domain* out, uint8_t* len);
/* Lower-cases ASCII letters in the byte range delimited by s and end. */
void lower(uint8_t* s,uint8_t* end);
/* NIF entry point parse_url/1, registered in the module's NIF table. */
ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
#include "parse_url.h"
#include "string.h"
/*
 * Stand-alone smoke test: parses a fixed URL through the get_* chain.
 * Frees the buffers the parser allocates so the run is leak-free.
 */
int main(void){
    uint8_t url[]="http://www.google.com:80/foobar";
    struct domain out = {.port=80};   /* remaining fields are zero-initialised */

    /* The parser takes a pointer to the last byte, not a length. */
    get_protocol(url,&out,url+strlen((char*)url)-1);

    /* Both buffers are malloc'd by the parser; free(NULL) is a no-op if a
     * step never ran. */
    free(out.protocol.data);
    free(out.domain.data);
    return 0;
}
/*
 * NIF parse_url/1: splits a URL binary into {Protocol, Domain, Port, Path}.
 *
 * Protocol and Domain are new binaries, Port is an integer (80 unless the
 * parser found one), and Path is a sub-binary of the argument running from
 * the first '/' after the host to the end, or <<"/">> when there is none.
 *
 * Returns the atom 'error' when argv[0] is not a binary, is shorter than 4
 * bytes, or cannot be parsed.
 */
ERL_NIF_TERM parse_url(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary block;
    ERL_NIF_TERM protocol, domain, path, output;
    unsigned char *buf;
    struct domain out = {.port=80,.success=false};  /* other fields are zeroed */

    (void)argc;
    /* enif_inspect_binary fails for non-binaries, so no separate type test. */
    if (!enif_inspect_binary(env, argv[0], &block) || block.size < 4)
        return enif_make_atom(env, "error");

    get_protocol(block.data, &out, block.data + block.size - 1);
    if (!out.success) {
        /* The parser may have allocated one or both buffers before giving
         * up; free(NULL) is a no-op for the ones it never reached. */
        free(out.protocol.data);
        free(out.domain.data);
        return enif_make_atom(env, "error");
    }

    /* Copy the malloc'd results into VM-owned binaries rather than passing
     * hand-built ErlNifBinary structs to enif_make_binary. */
    buf = enif_make_new_binary(env, out.protocol.size, &protocol);
    if (out.protocol.size > 0)
        memcpy(buf, out.protocol.data, out.protocol.size);
    buf = enif_make_new_binary(env, out.domain.size, &domain);
    if (out.domain.size > 0)
        memcpy(buf, out.domain.data, out.domain.size);
    free(out.protocol.data);
    free(out.domain.data);

    if (out.path == 0) {
        /* No path in the URL: default to "/". */
        buf = enif_make_new_binary(env, 1, &path);
        buf[0] = '/';
    } else {
        /* out.path counts bytes from the path start to the end of input. */
        path = enif_make_sub_binary(env, argv[0], block.size - out.path, out.path);
    }

    output = enif_make_tuple4(env, protocol, domain,
                              enif_make_int(env, out.port), path);
    return output;
}
/*
 * First parsing step: extracts the scheme and hands the rest to get_domain.
 *
 * s:   first byte of the URL.
 * out: receives protocol.data (malloc'd, lower-cased; the caller frees it)
 *      and protocol.size. When the input has no "scheme://" prefix the
 *      protocol defaults to "http" and the whole input is parsed as host.
 * len: pointer to the LAST valid byte of the URL.
 *
 * out->success is left untouched on failure: empty input, nothing after
 * "://", or out of memory.
 */
void get_protocol(uint8_t* s, struct domain* out, uint8_t* len){
    uint8_t *colon, *protocol, *host;
    size_t n, k;

    if (len < s) return; /* empty input */

    colon = memchr(s, ':', (size_t)(len - s) + 1);
    /* Only treat the prefix as a scheme when "://" really follows and fits in
     * the buffer; blindly skipping three bytes could step past the end. */
    if (colon != NULL && len - colon >= 2 && colon[1] == '/' && colon[2] == '/') {
        host = colon + 3;
        if (host > len) return; /* nothing after "://": badarg */

        n = (size_t)(colon - s);
        protocol = malloc(n ? n : 1); /* never ask for 0 bytes */
        if (protocol == NULL) return;
        /* Lower-case while copying, so every byte including the first is
         * handled and nothing outside the n-byte buffer is touched. */
        for (k = 0; k < n; k++) {
            const uint8_t c = s[k];
            protocol[k] = (c >= 'A' && c <= 'Z') ? (uint8_t)(c + ('a' - 'A')) : c;
        }
    } else {
        host = s;
        n = 4;
        protocol = malloc(n);
        if (protocol == NULL) return;
        memcpy(protocol, "http", n);
    }

    out->protocol.size = n;
    out->protocol.data = protocol;
    get_domain(host, out, len);
}
/*
 * Second parsing step: extracts the host name, which runs up to the first
 * ':' or '/' or the end of input.
 *
 * s:   first byte of the host.
 * out: receives domain.data (malloc'd, lower-cased; the caller frees it) and
 *      domain.size. out->success is set here when the URL ends right after
 *      the host.
 * len: pointer to the LAST valid byte of the URL.
 *
 * Continues with get_port after ':' and get_path at '/'. An empty host or an
 * allocation failure returns with out->success untouched.
 *
 * Defined without `inline` so an external definition is always emitted.
 */
void get_domain(uint8_t* s, struct domain* out, uint8_t* len){
    uint8_t *end = len + 1;   /* one past the last valid byte */
    uint8_t *i = s;
    uint8_t *domain;
    size_t n, k;

    /* Bounds test first, so *i is never read past the end of the input. */
    while (i < end && *i != ':' && *i != '/') i++;
    if (i == s) return; /* badarg: empty host */

    n = (size_t)(i - s);
    domain = malloc(n);
    if (domain == NULL) return;
    /* Lower-case while copying, so every byte including the first is handled
     * and nothing outside the n-byte buffer is touched. */
    for (k = 0; k < n; k++) {
        const uint8_t c = s[k];
        domain[k] = (c >= 'A' && c <= 'Z') ? (uint8_t)(c + ('a' - 'A')) : c;
    }
    out->domain.size = n;
    out->domain.data = domain;

    if (i == end) {
        /* URL ends after the host. */
        out->success = true;
        return;
    }
    if (*i == '/')
        get_path(i, out, len);
    else /* *i == ':' */
        get_port(i + 1, out, len);
}
/*
 * Third parsing step: parses the decimal port that follows "host:".
 *
 * s:   first byte after the ':'.
 * out: out->port is overwritten when digits are present; out->success is set
 *      here when the URL ends after the port.
 * len: pointer to the LAST valid byte of the URL.
 *
 * The port runs up to the next '/' or the end of input. It is now parsed in
 * both cases; previously a port with no path after it was ignored. A trailing
 * ':' with nothing after it keeps the default port.
 *
 * Returns with out->success untouched (badarg) when the port is empty before
 * a '/', is longer than 5 characters, contains a non-digit, or exceeds 65535.
 * Continues with get_path when a '/' follows.
 *
 * Defined without `inline` so an external definition is always emitted.
 */
void get_port(uint8_t* s, struct domain* out, uint8_t* len){
    uint8_t *end = len + 1;   /* one past the last valid byte */
    uint8_t *slash = NULL;
    uint8_t *stop, *p;
    unsigned int port = 0;

    if (s < end)
        slash = memchr(s, '/', (size_t)(end - s));
    if (slash == s) return; /* badarg: "host:/..." */

    stop = (slash != NULL) ? slash : end;
    if (s < stop) {
        if (stop - s > 5) return; /* badarg: too many digits */
        /* Parse digits directly from the input: it is not NUL-terminated, so
         * atoi on a partially filled copy could read stale bytes. */
        for (p = s; p < stop; p++) {
            if (*p < '0' || *p > '9') return; /* badarg */
            port = port * 10 + (unsigned int)(*p - '0');
        }
        if (port > 65535) return; /* badarg: not a valid port number */
        out->port = port;
    }

    if (slash == NULL) {
        /* URL ends after the port. */
        out->success = true;
        return;
    }
    get_path(slash, out, len);
}
/*
 * Final parsing step: records where the path starts and marks the parse as
 * successful.
 *
 * s:   first byte of the path (the '/').
 * out: out->path receives the number of bytes from s through the end of the
 *      input, so the caller can slice the path out of the original buffer
 *      without copying; out->success is set to true.
 * len: pointer to the LAST valid byte of the URL.
 *
 * Defined without `inline` so an external definition is always emitted.
 */
void get_path(uint8_t* s, struct domain* out, uint8_t* len){
    out->path = (size_t)(len - s) + 1;
    out->success = true;
}
/*
 * Lower-cases ASCII 'A'-'Z' in place over the half-open range [s, end).
 *
 * s:   first byte to process.
 * end: one past the last byte to process; callers must pass s + length.
 *
 * The original loop incremented s before dereferencing it, so it skipped the
 * first byte and touched *end.
 *
 * Defined without `inline` so an external definition is always emitted.
 */
void lower(uint8_t* s,uint8_t* end){
    for (; s < end; s++) {
        if (*s >= 'A' && *s <= 'Z')
            *s += 'a' - 'A';
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment