Skip to content

Instantly share code, notes, and snippets.

@myzie
Created February 20, 2024 01:44
Show Gist options
  • Save myzie/601cf170f88b09efd2012ea322a80259 to your computer and use it in GitHub Desktop.
Save myzie/601cf170f88b09efd2012ea322a80259 to your computer and use it in GitHub Desktop.
Risor Web Crawler
package main
import (
"fmt"
"github.com/risor-io/risor/object"
"github.com/risor-io/risor/op"
)
// CrawlerType is the Risor type name returned by CrawlerObject.Type and used
// in CrawlerObject's error messages.
const CrawlerType object.Type = "crawler.crawler"
// CrawlerObject wraps a *Crawler so that it can be handed to Risor scripts,
// exposing the crawler's fields as the "response" and "status" attributes.
type CrawlerObject struct {
	// value is the wrapped crawler that attribute reads and writes act on.
	value *Crawler
}
// Type returns CrawlerType, the Risor type name of crawler objects.
func (c *CrawlerObject) Type() object.Type {
	return CrawlerType
}
// Inspect returns a fixed textual representation of a crawler object.
func (c *CrawlerObject) Inspect() string {
	const repr = "crawler.crawler()"
	return repr
}
// Interface returns the wrapped *Crawler as an untyped Go value.
func (c *CrawlerObject) Interface() interface{} {
	wrapped := c.value
	return wrapped
}
// IsTruthy always returns true: a crawler object is never considered falsy.
func (c *CrawlerObject) IsTruthy() bool {
	return true
}
// Cost always returns 0 for crawler objects.
func (c *CrawlerObject) Cost() int {
	return 0
}
// MarshalJSON always fails: crawler objects have no JSON representation.
// It returns a nil byte slice together with a type error.
func (c *CrawlerObject) MarshalJSON() ([]byte, error) {
	err := fmt.Errorf("type error: unable to marshal crawler.crawler")
	return nil, err
}
// RunOperation rejects every binary operation: it ignores the right operand
// and returns an error object naming the crawler type and the attempted
// operation.
func (c *CrawlerObject) RunOperation(opType op.BinaryOpType, right object.Object) object.Object {
	const format = "eval error: unsupported operation for %s: %v"
	return object.Errorf(format, CrawlerType, opType)
}
// Equals compares by identity: the result is a true Bool only when other is
// this very *CrawlerObject.
func (c *CrawlerObject) Equals(other object.Object) object.Object {
	same := other == object.Object(c)
	return object.NewBool(same)
}
// SetAttr assigns one of the writable attributes of the wrapped crawler.
//
//   - "response" requires a value convertible with object.AsString and is
//     stored in Crawler.Response.
//   - "status" requires a value convertible with object.AsInt and is stored
//     in Crawler.Status.
//
// A failed conversion returns the conversion error; any other attribute name
// returns an attribute error.
func (c *CrawlerObject) SetAttr(name string, value object.Object) error {
	if name == "response" {
		text, errObj := object.AsString(value)
		if errObj != nil {
			return errObj.Value()
		}
		c.value.Response = text
		return nil
	}
	if name == "status" {
		code, errObj := object.AsInt(value)
		if errObj != nil {
			return errObj.Value()
		}
		c.value.Status = int(code)
		return nil
	}
	return fmt.Errorf("attribute error: %s object has no attribute %q", CrawlerType, name)
}
// GetAttr reads an attribute of the wrapped crawler. "response" yields the
// Response field as a Risor string and "status" yields the Status field as a
// Risor int. The boolean result is false for any other name.
func (c *CrawlerObject) GetAttr(name string) (object.Object, bool) {
	if name == "response" {
		return object.NewString(c.value.Response), true
	}
	if name == "status" {
		return object.NewInt(int64(c.value.Status)), true
	}
	return nil, false
}
// NewCrawlerObject wraps c in a CrawlerObject. The crawler is stored by
// pointer, so attribute writes made through the object are visible via c.
func NewCrawlerObject(c *Crawler) *CrawlerObject {
	obj := new(CrawlerObject)
	obj.value = c
	return obj
}
package main
import (
"context"
"fmt"
"github.com/risor-io/risor/object"
"github.com/risor-io/risor/op"
)
// CrawlerRegistryType is the Risor type name returned by
// CrawlerRegistryObject.Type and used in its error messages.
const CrawlerRegistryType object.Type = "crawler.registry"
// CrawlerRegistryObject wraps a *CrawlerRegistry so that Risor scripts can use
// it through the "register" and "call" attributes.
type CrawlerRegistryObject struct {
	// value is the wrapped registry that the builtins operate on.
	value *CrawlerRegistry
}
// Type returns CrawlerRegistryType, the Risor type name of registry objects.
func (r *CrawlerRegistryObject) Type() object.Type {
	return CrawlerRegistryType
}
// Inspect returns a fixed textual representation of a registry object.
func (r *CrawlerRegistryObject) Inspect() string {
	const repr = "crawler.registry()"
	return repr
}
// Interface returns the wrapped *CrawlerRegistry as an untyped Go value.
func (r *CrawlerRegistryObject) Interface() interface{} {
	wrapped := r.value
	return wrapped
}
// IsTruthy always returns true: a registry object is never considered falsy.
func (r *CrawlerRegistryObject) IsTruthy() bool {
	return true
}
// Cost always returns 0 for registry objects.
func (r *CrawlerRegistryObject) Cost() int {
	return 0
}
// MarshalJSON always fails: registry objects have no JSON representation.
// It returns a nil byte slice together with a type error.
func (r *CrawlerRegistryObject) MarshalJSON() ([]byte, error) {
	err := fmt.Errorf("type error: unable to marshal crawler.registry")
	return nil, err
}
// RunOperation rejects every binary operation: it ignores the right operand
// and returns an error object naming the registry type and the attempted
// operation.
func (r *CrawlerRegistryObject) RunOperation(opType op.BinaryOpType, right object.Object) object.Object {
	const format = "eval error: unsupported operation for %s: %v"
	return object.Errorf(format, CrawlerRegistryType, opType)
}
// Equals compares by identity: the result is a true Bool only when other is
// this very *CrawlerRegistryObject.
func (r *CrawlerRegistryObject) Equals(other object.Object) object.Object {
	same := other == object.Object(r)
	return object.NewBool(same)
}
// SetAttr always fails: the registry object exposes no writable attributes,
// so every name produces an attribute error and value is ignored.
func (r *CrawlerRegistryObject) SetAttr(name string, value object.Object) error {
	err := fmt.Errorf("attribute error: %s object has no attribute %q", CrawlerRegistryType, name)
	return err
}
// GetAttr resolves the script-visible attributes of the registry. "register"
// and "call" each yield a freshly created builtin; the boolean result is
// false for any other name.
func (r *CrawlerRegistryObject) GetAttr(name string) (object.Object, bool) {
	switch name {
	case "register":
		return object.NewBuiltin("crawler.registry.register", r.registerBuiltin), true
	case "call":
		return object.NewBuiltin("crawler.registry.call", r.callBuiltin), true
	default:
		return nil, false
	}
}

// registerBuiltin implements registry.register(name, fn). It expects exactly
// two arguments, a string and a Risor function, and stores a Go callback in
// the wrapped registry that invokes fn with a crawler object and the query.
// It returns Nil on success and an error object on invalid input.
//
// The stored callback reuses the ctx and call function captured when register
// was invoked, not ones supplied at crawl time.
func (r *CrawlerRegistryObject) registerBuiltin(ctx context.Context, args ...object.Object) object.Object {
	if len(args) != 2 {
		return object.Errorf("wrong number of arguments. got=%d, want=2", len(args))
	}
	crawlerName, errObj := object.AsString(args[0])
	if errObj != nil {
		return errObj
	}
	fn, isFunc := args[1].(*object.Function)
	if !isFunc {
		return object.Errorf("argument error: expected function, got %s", args[1].Type())
	}
	callFunc, found := object.GetCallFunc(ctx)
	if !found {
		return object.Errorf("unable to get call function")
	}
	r.value.Register(crawlerName, func(crawler *Crawler, query string) error {
		callArgs := []object.Object{NewCrawlerObject(crawler), object.NewString(query)}
		_, err := callFunc(ctx, fn, callArgs)
		return err
	})
	return object.Nil
}

// callBuiltin implements registry.call(name, query). It expects exactly two
// string arguments, runs the named crawler through the wrapped registry and
// returns the resulting crawler wrapped as a CrawlerObject. A failure reported
// by the registry is returned as a "crawler error" error object.
func (r *CrawlerRegistryObject) callBuiltin(ctx context.Context, args ...object.Object) object.Object {
	if len(args) != 2 {
		return object.Errorf("wrong number of arguments. got=%d, want=2", len(args))
	}
	crawlerName, errObj := object.AsString(args[0])
	if errObj != nil {
		return errObj
	}
	query, errObj := object.AsString(args[1])
	if errObj != nil {
		return errObj
	}
	crawler, err := r.value.Call(crawlerName, query)
	if err != nil {
		return object.Errorf("crawler error: %v", err)
	}
	return NewCrawlerObject(crawler)
}
// NewCrawlerRegistryObject wraps r in a CrawlerRegistryObject. The registry is
// stored by pointer, so registrations made from scripts are visible via r.
func NewCrawlerRegistryObject(r *CrawlerRegistry) *CrawlerRegistryObject {
	obj := new(CrawlerRegistryObject)
	obj.value = r
	return obj
}
package main
import (
"context"
"flag"
"fmt"
"os"
"sync"
"github.com/risor-io/risor"
)
// CrawlerFunc is a crawl callback. It receives the Crawler to populate and the
// query to crawl for; a non-nil error marks the crawl as failed.
type CrawlerFunc func(crawler *Crawler, query string) error

// Crawler holds the outcome of a single crawl. Its fields are filled in by the
// CrawlerFunc that Call invokes.
type Crawler struct {
	Response string
	Status   int
}

// CrawlerRegistry maps crawler names to their callbacks. Access to the map is
// guarded by mutex; the callbacks themselves run outside of the lock.
type CrawlerRegistry struct {
	crawlers map[string]CrawlerFunc
	mutex    sync.RWMutex
}

// NewCrawlerRegistry returns an empty registry.
func NewCrawlerRegistry() *CrawlerRegistry {
	return &CrawlerRegistry{
		crawlers: make(map[string]CrawlerFunc),
	}
}

// Register stores callback under name, replacing any callback previously
// registered with the same name.
func (cr *CrawlerRegistry) Register(name string, callback CrawlerFunc) {
	cr.mutex.Lock()
	defer cr.mutex.Unlock()
	// Allocate lazily so that a zero-value CrawlerRegistry is usable instead
	// of panicking on assignment to a nil map.
	if cr.crawlers == nil {
		cr.crawlers = make(map[string]CrawlerFunc)
	}
	cr.crawlers[name] = callback
}

// Call runs the crawler registered under name with query and returns the
// Crawler that the callback populated. It returns an error when no crawler is
// registered under name, or the callback's own error when the callback fails;
// in both cases the returned *Crawler is nil.
func (cr *CrawlerRegistry) Call(name string, query string) (*Crawler, error) {
	// Hold the lock only for the map lookup. Running the callback under the
	// read lock would deadlock a callback that calls Register, and a nested
	// Call would be recursive read locking, which sync.RWMutex prohibits.
	cr.mutex.RLock()
	callback, ok := cr.crawlers[name]
	cr.mutex.RUnlock()
	if !ok {
		return nil, fmt.Errorf("crawler not found: %s", name)
	}

	crawler := &Crawler{}
	if err := callback(crawler, query); err != nil {
		return nil, err
	}
	return crawler, nil
}
// defaultScript is the Risor program used as the default value of the -script
// flag. It registers a "google" crawler through the `registry` global, whose
// callback fetches a search URL for the query and stores the response text and
// status code on the crawler, and then runs that crawler for the query
// "animals" and prints the resulting status and response length.
var defaultScript = `
print("script running...")
registry.register("google", func(crawler, query) {
response := fetch("https://www.google.com/search?q=" + query)
crawler.response = response.text()
crawler.status = response.status_code
printf("crawl complete for \"%s\" (status: %d)\n", query, crawler.status)
});
print("crawling...")
result := registry.call("google", "animals")
print("status:", result.status, "response len:", len(result.response))
`
// main evaluates a Risor script with a fresh CrawlerRegistry exposed to it as
// the `registry` global. The script defaults to defaultScript and can be
// replaced with the -script flag. If evaluation fails, the error is written to
// standard error and the process exits with status 1.
func main() {
	var script string
	// The flag value is passed to risor.Eval verbatim, so it is Risor source
	// text, not a file path; the usage string says so.
	flag.StringVar(&script, "script", defaultScript, "Risor script source to evaluate")
	flag.Parse()

	app := NewCrawlerRegistry()
	ctx := context.Background()

	globals := map[string]interface{}{
		"registry": NewCrawlerRegistryObject(app),
	}
	if _, err := risor.Eval(ctx, script, risor.WithGlobals(globals)); err != nil {
		// Diagnostics belong on stderr so they do not mix with script output.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
@marianvlad
Copy link

Running .Call() inside goroutines gives these errors:

// defaultScript is the commenter's variant of the gist's script, used to
// reproduce the reported errors: the callback issues its request with
// http.get(...).send(), leaves crawler.response unset (that assignment is
// commented out inside the script) and hard-codes the status to 404.
var defaultScript = `
print("script running...")

registry.register("google", func(crawler, query) {
	response := http.get("https://www.google.com/search?q=" + query).send()
	//crawler.response = response.text()
	crawler.status = 404
	printf("crawl complete for \"%s\" (status: %d)\n", query, crawler.status)
});

print("crawling...")

result := registry.call("google", "animals")
print("status:", result.status, "response len:", len(result.response))
`

// main reproduces the reported failure: it evaluates the script once and then
// calls app.Call from two goroutines, each of which runs the Risor callback
// that the script registered.
func main() {
	var script string
	flag.StringVar(&script, "script", defaultScript, "path to the script file")
	flag.Parse()

	app := NewCrawlerRegistry()

	ctx := context.Background()

	_, err := risor.Eval(ctx, script, risor.WithGlobals(map[string]interface{}{
		"registry": NewCrawlerRegistryObject(app),
	}))
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	// NOTE(review): the two goroutines below invoke the same script callback
	// concurrently; the author's reply in this thread states that Risor VMs
	// are safe for use by one goroutine only.
	go func() {
		fmt.Println(app.Call("google", "apple"))
	}()

	go func() {
		fmt.Println(app.Call("google", "vegetable"))
	}()

	// Nothing waits on the goroutines explicitly; the sleep only gives them
	// time to run before main returns.
	time.Sleep(5 * time.Second)
}
script running...
crawling...
crawl complete for "animals" (status: 404)
status: 404 response len: 0
<nil> exec error: attribute "send" not found on string object
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x2 addr=0x58 pc=0x1003b4c20]

goroutine 8 [running]:
github.com/risor-io/risor/object.AsString({0x0?, 0x0?})
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/object/typeconv.go:67 +0x120
github.com/risor-io/risor/modules/http.Module.MethodCmd.func2({0x100562280?, 0x1005acec0?}, {0x14000240020, 0x1, 0x11?})
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/modules/http/http.go:41 +0x80
github.com/risor-io/risor/object.(*Builtin).Call(0x100865e80?, {0x1005cce00?, 0x14000192a50?}, {0x14000240020?, 0x140000739c8?, 0x10045cc50?})
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/object/builtin.go:52 +0x44
github.com/risor-io/risor/vm.(*VirtualMachine).call(0x14000200000, {0x1005cce00, 0x14000192a50}, {0x1005cfa58?, 0x1400003b780}, 0x1)
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/vm/vm.go:663 +0x308
github.com/risor-io/risor/vm.(*VirtualMachine).eval(0x14000200000, {0x1005cce00, 0x14000192a50})
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/vm/vm.go:312 +0x40d8
github.com/risor-io/risor/vm.(*VirtualMachine).callFunction(0x14000200000, {0x1005cce00, 0x14000192a50}, 0x140001ac000, {0x140000a0d20, 0x2, 0x1?})
	/Users/gabi/go/pkg/mod/github.com/risor-io/[email protected]/vm/vm.go:771 +0x1e8
main.(*CrawlerRegistryObject).GetAttr.func1.1(0x140000ca7f8, {0x100463d24, 0x5})
	/Users/gabi/go/src/risor_testing/crawler_registry.go:74 +0x13c
main.(*CrawlerRegistry).Call(0x1005cce38?, {0x1004640a9, 0x6}, {0x100463d24, 0x5})
	/Users/gabi/go/src/risor_testing/main.go:43 +0x120
main.main.func1()
	/Users/gabi/go/src/risor_testing/main.go:86 +0x38
created by main.main in goroutine 1
	/Users/gabi/go/src/risor_testing/main.go:85 +0x2b4
exit status 2

--------

script running...
crawling...
crawl complete for "animals" (status: 404)
status: 404 response len: 0
<nil> exec error: attribute "send" not found on string object
<nil> type error: expected a string (builtin given)

But with cr.mutex.Lock() and defer cr.mutex.Unlock() in Call() instead of RLock and RUnlock, the code works as expected. Removing the locks entirely also gives errors. The same thing happens with fetch() instead of http. I don't know whether this is a Go-specific error or a Risor one...

@myzie
Copy link
Author

myzie commented Feb 23, 2024

Risor VMs are safe for use by one goroutine only. In v1.4.0 of Risor, which I just released, I think the Call() function will return an error with your code, because it checks whether the VM is already active and raises an error if it is.

This new release of Risor adds support for the go and defer keywords as well as channels. You might want to try moving the concurrency into the Risor script now using these mechanisms. Or use the vm.Clone() method to create a copy of the VM per goroutine in Go.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment