Skip to content

Instantly share code, notes, and snippets.

@ashim888
Last active July 6, 2016 11:05
Show Gist options
  • Save ashim888/797cae73c921e282a56db0dc477455e1 to your computer and use it in GitHub Desktop.
Save ashim888/797cae73c921e282a56db0dc477455e1 to your computer and use it in GitHub Desktop.

Revisions

  1. ashim888 revised this gist Jul 6, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion safety.py
    Original file line number Diff line number Diff line change
    @@ -23,7 +23,7 @@ def safety_check(domain):
    language = language_prediction[0]

    # TRANSLATE LANGUAGE
    translator = Translator('ashim888_translator', 'TPwDatIXNxEasFBxSpLJ/coozTnml/4NaWHiyHVavRQ=')
    translator = Translator('<Your Client ID>', '<Your Client Secret>')
    if language!='en':
    print 'Another language Found: '+ language
    overall_text= translator.translate(overall_text, "en")
  2. ashim888 revised this gist Jul 6, 2016. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion safety.py
    Original file line number Diff line number Diff line change
    @@ -9,7 +9,6 @@

    porn_list=["anal","hentai","anus","arse","butt","arsehole","ass","fcuk","fuck","naked","xvideos","porn", "sex", "porno", "free porn", "porn tube", "porn videos", "streaming porn","Free porn", "sex videos","pussy","Porn hub", "xxx" "porn", "sex" ]
    def safety_check(domain):
    count=0
    tokenizer = RegexpTokenizer(r'\w+')
    client = MongoClient('mongodb://192.168.1.10:27017/',27017)
    db = client.cutestat_v3
  3. ashim888 created this gist Jul 6, 2016.
    61 changes: 61 additions & 0 deletions safety.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from pymongo import MongoClient
    import pprint
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer
    from microsofttranslator import Translator
    import langid

    # Keywords that mark a site as unsafe.  Entries are lower-case and unique
    # because safety_check() compares them against lower-cased tokens.
    # NOTE: the original literal contained `"xxx" "porn"` (missing comma), which
    # Python concatenates into the single string "xxxporn"; "xxx" is now its own
    # entry.
    # NOTE(review): the multi-word phrases are kept for reference, but a
    # single-word tokenizer (r'\w+') can never produce them, so only the
    # one-word entries can actually match.
    porn_list = [
        "anal", "hentai", "anus", "arse", "butt", "arsehole", "ass", "fcuk",
        "fuck", "naked", "xvideos", "porn", "sex", "porno", "free porn",
        "porn tube", "porn videos", "streaming porn", "sex videos", "pussy",
        "porn hub", "xxx",
    ]
    def safety_check(domain, mongo_uri='mongodb://192.168.1.10:27017/'):
        """Report whether the stored metadata for *domain* contains abusive keywords.

        Looks up the ``WebInfo`` document for *domain* in the ``cutestat_v3``
        database, builds one text from its ``title``, ``metaDescription`` and
        ``metaTags`` fields, translates it to English when langid detects
        another language, and intersects its lower-cased tokens with
        ``porn_list``.  The verdict is printed, as before.

        Args:
            domain: Domain name used as the ``domain`` key of the lookup.
            mongo_uri: MongoDB connection URI; defaults to the address the
                script has always used.

        Returns:
            True if no keyword matched (PASS), False if at least one matched
            (FAIL), None if the domain is not in the database or an error
            occurred.  Earlier versions always returned None, so callers that
            ignore the result are unaffected.

        Translator credentials are read from the environment variables
        ``MS_TRANSLATOR_CLIENT_ID`` and ``MS_TRANSLATOR_CLIENT_SECRET`` instead
        of being hard-coded; without them the untranslated text is checked.
        """
        import os  # function-scope import: only needed for the credential lookup

        client = None
        try:
            # The port is already part of the URI, so no separate port argument.
            client = MongoClient(mongo_uri)
            record = client.cutestat_v3.WebInfo.find_one({"domain": domain})
            if record is None:
                print(domain + ' Not Found In Database' + '\n')
                return None

            # Join with spaces so the last word of one field cannot fuse with
            # the first word of the next.
            overall_text = u' '.join(
                _field_text(record.get(key))
                for key in ('title', 'metaDescription', 'metaTags')
            )

            # langid.classify returns a (language, score) pair; fall back to
            # English (no translation) if nothing usable comes back.
            prediction = langid.classify(overall_text)
            language = prediction[0] if prediction else 'en'

            if language != 'en':
                print('Another language Found: ' + language)
                client_id = os.environ.get('MS_TRANSLATOR_CLIENT_ID')
                client_secret = os.environ.get('MS_TRANSLATOR_CLIENT_SECRET')
                if client_id and client_secret:
                    translator = Translator(client_id, client_secret)
                    overall_text = translator.translate(overall_text, "en")
                else:
                    print('Translator credentials not configured; '
                          'checking untranslated text')

            # Lower-case BEFORE the stop-word test: the stop-word list is
            # lower-case, so "The" used to slip through the filter.
            stop_words = set(stopwords.words('english'))
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = set()
            for token in tokenizer.tokenize(overall_text):
                token = token.lower()
                if token not in stop_words:
                    tokens.add(token)

            matches = tokens.intersection(word.lower() for word in porn_list)

            if matches:
                print(domain + " SAFETY CHECK FAIL")
                print("Total Abusive Keywords Found: %d" % len(matches))
                print('\n')
                return False

            print(domain + " SAFETY CHECK PASS")
            print('\n')
            return True
        except Exception as exc:
            # Best-effort boundary, as in the original: report and carry on so
            # one bad domain does not abort a batch of checks.
            print('%s SAFETY CHECK ERROR: %r' % (domain, exc))
            return None
        finally:
            # Release the connection; the original leaked one client per call.
            if client is not None:
                client.close()


    def _field_text(value):
        """Return one document field as text without ASCII-only str() coercion.

        None becomes an empty string, lists/tuples are space-joined item by
        item, and byte strings are decoded as UTF-8.
        NOTE(review): the WebInfo schema is not visible here; list handling is
        a guess that ``metaTags`` may be stored as a list -- confirm.
        """
        if value is None:
            return u''
        if isinstance(value, (list, tuple)):
            return u' '.join(_field_text(item) for item in value)
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        # u'%s' yields unicode on Python 2 and str on Python 3 and, unlike
        # str(), does not raise on non-ASCII unicode input under Python 2.
        return u'%s' % (value,)

    # Run the check for each sample domain, in the original order.
    for sample_domain in (
        "www.befuck.com",
        "baidu.com",
        "gioia.it",
        "partyporn.co.il",
        "x-nxx.co.il",
        "jw.org",
        "xhamster.com",
        "www.xnxx.com",
        "ratopati.com",
    ):
        safety_check(sample_domain)