XSS 多分类:优选贝叶斯、逻辑回归、决策树

复制代码
import re
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.externals import  joblib
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import linear_model 
from sklearn.naive_bayes import GaussianNB

from sklearn import cross_validation
from sklearn import tree

# Module-level accumulators filled by etl(): `x` collects one feature row per
# input line, `y` collects the matching 0/1 label at the same index.
x, y = [], []

def get_len(url):
    """Return the number of characters in *url* (length feature)."""
    size = len(url)
    return size

def get_url_count(url):
    """Return 1 if *url* contains ``http://`` or ``https://``, else 0.

    The match is case-insensitive and may occur anywhere in the string.
    Despite the name, this is a presence flag rather than a count.
    """
    found = re.search('(http://)|(https://)', url, re.IGNORECASE)
    return int(found is not None)

def get_evil_char(url):
    """Count occurrences of the characters ``< > , ' " /`` in *url*."""
    hits = re.finditer("[<>,\'\"/]", url, re.IGNORECASE)
    # Each iterator item is one matched character; tally them.
    return sum(1 for _ in hits)

def get_evil_word(url):
    """Count suspicious XSS-related tokens in *url* (case-insensitive).

    Tokens counted: ``alert``, ``script=``, ``%3c``, ``%3e``, ``%20``,
    ``onerror``, ``onload``, ``eval``, ``src=`` and ``prompt``. Matches are
    non-overlapping and found left to right.

    The original pattern was missing a ``|`` between ``(script=)`` and
    ``(%3c)``, so ``script=`` was only counted when immediately followed by
    ``%3c`` and a standalone ``%3c`` was never counted. Each token is now
    its own alternative.
    """
    pattern = r"alert|script=|%3c|%3e|%20|onerror|onload|eval|src=|prompt"
    return len(re.findall(pattern, url, re.IGNORECASE))

def get_last_char(url):
    """Return 1 if *url* ends with ``/``, else 0.

    ``$`` also matches just before a single trailing newline, so a line
    read from a file such as ``"a/\\n"`` still yields 1.
    """
    if not re.search('/$', url, re.IGNORECASE):
        return 0
    return 1

def get_feature(url):
    """Return the five-element feature vector for *url*.

    Order: length, http(s) presence flag, evil-character count,
    evil-word count, trailing-slash flag.
    """
    extractors = (
        get_len,
        get_url_count,
        get_evil_char,
        get_evil_word,
        get_last_char,
    )
    return [extract(url) for extract in extractors]


def etl(filename, data, isxss):
    """Append one feature row per line of *filename* to *data*.

    Each line is passed to the extractors exactly as read (no stripping),
    producing a four-element row: length, http(s) flag, evil-character
    count, evil-word count. Unlike get_feature(), the trailing-slash flag
    is not included.

    For every row, a label is appended to the module-level list ``y``:
    1 when *isxss* is truthy, 0 otherwise.

    Returns the same *data* list that was passed in.
    """
    with open(filename) as handle:
        for record in handle:
            row = [
                get_len(record),
                get_url_count(record),
                get_evil_char(record),
                get_evil_word(record),
            ]
            data.append(row)
            y.append(1 if isxss else 0)
    return data

# Build the training set: XSS samples are labelled 1, benign samples 0.
# Both calls append into the shared module-level lists `x` (features) and
# `y` (labels), so row order and label order stay aligned.
etl('../data/xss-200000.txt', x, 1)
etl('../data/good-xss-200000.txt', x, 0)

clf = tree.DecisionTreeClassifier()
clf2 = svm.SVC(kernel='linear', C=1)
clf3 = linear_model.LogisticRegression(C=1e5)
clf4 = GaussianNB()

# An ordered list (rather than a dict) keeps the report order deterministic.
# As its label suggests, the SVM entry is slow to train; remove that tuple
# for a quick run.
models = [
    ("decision tree", clf),
    ("svm train long time", clf2),
    ("LR", clf3),
    ("bayes", clf4),
]

# The loop variable is deliberately not named `clf`, so the module-level
# `clf` keeps pointing at the decision tree after the loop.
# NOTE(review): `sklearn.cross_validation` was replaced by
# `sklearn.model_selection` in newer scikit-learn releases - confirm the
# installed version before running.
for name, model in models:
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("model: %s" % name)
    score = cross_validation.cross_val_score(model, x, y, n_jobs=-1, cv=5)
    print(score)
    print(np.mean(score))
复制代码

 

posted @   bonelee  阅读(732)  评论(1)  编辑  收藏  举报
编辑推荐:
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· DeepSeek 开源周回顾「GitHub 热点速览」
历史上的今天:
2016-11-11 elasticsearch 自定义_id
2016-11-11 自定义实现spark的分区函数
2016-11-11 elasticsearch 路由文档到分片
2016-11-11 spark streaming 入门例子
2016-11-11 ElasticSearch-hadoop saveToEs源码分析
点击右上角即可分享
微信分享提示