Running mem-data-analysis-framework
Download: wget https://mem:xxxx@alcatraz.net.in.tum.de/mem-prototype/mem-data-analysis-framework.tar.gz
docker load <mem-data-analysis-framework.tar.gz
docker run --network="host" -t -d mem-prototype:latest
Then open localhost:8080 in a browser to see the code!
Upload data into the Docker container:
sudo docker cp x.txt e66d9d295e54:/tmp/
[sudo] password for kali:
root@kali:/work# ls /tmp/
hsperfdata_root spark-events x.txt zeppelin-index
You can see the data is now under /tmp!
Compute similarities between server behaviors, then "cluster" them with k-NN based on those similarities:
```python
%spark.pyspark
import os
from pyspark.sql import functions as f
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from graphframes import GraphFrame

# df_only_fp, version_match and calculate_similarites are defined in earlier notebook cells.

# Returns all edges between server behaviors where k versions match.
# For the remaining client hellos without matches, one of the responses has to be a timeout.
def get_core(versions, k):
    versions = versions.select(f.col('id').alias('id_a'), f.col('versions').alias('v_a'))\
        .join(versions.select(f.col('id').alias('id_b'), f.col('versions').alias('v_b')), f.col('id_a') <= f.col('id_b'))\
        .filter(version_match(f.col('v_a'), f.col('v_b'), f.lit(k)))
    print(f'threshold: {k}, version matches: {versions.count()}')
    return (versions)

versions = df_only_fp.select(f.col('versions'), f.col('timeouts')).distinct().sort(f.col('versions').asc()).withColumn("id", f.monotonically_increasing_id())
versions.write.mode('overwrite').parquet(f'/tmp/versions.parquet')
versions = spark.read.parquet(f'/tmp/versions.parquet')
versions = versions.filter(f.col('timeouts') <= 5)

for i in range(0, 9):
    tmp_versions = get_core(versions, 24 - i)
    edges = calculate_similarites(df_only_fp, tmp_versions)
    edges.write.mode('overwrite').parquet(f'/tmp/fp_distances_raw_{24-i}_new.parquet')
    edges = spark.read.parquet(f'/tmp/fp_distances_raw_{24-i}_new.parquet')


%spark.pyspark
window = Window.partitionBy('fp_a').orderBy(col('similarity').desc_nulls_last())

# The functions temporarily store results for faster processing afterwards
def generate_knndf(fp_distances, k, file_name):
    if k != 0:
        knn_df = fp_distances.withColumn('row', f.rank().over(window)).where(col('row') <= k).drop('row')
    else:
        knn_df = fp_distances
    knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', file_name))
    return knn_df

def generate_simple_knndf(knn_df, file_name):
    # Normalise edge direction so every undirected pair appears only once
    s_knn_df = knn_df.withColumn('fp_1', f.when(col('fp_a') < col('fp_b'), col('fp_a')).otherwise(col('fp_b')))\
        .withColumn('fp_2', f.when(col('fp_1') == col('fp_b'), col('fp_a')).otherwise(col('fp_b')))\
        .drop('fp_a').drop('fp_b')\
        .withColumnRenamed('fp_1', 'fp_a').withColumnRenamed('fp_2', 'fp_b').distinct()
    s_knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', 's_' + file_name))

def generate_g_knndf(knn_df, file_name):
    # Connected components of the k-NN graph are the behavior "clusters"
    gf_knn = GraphFrame(df_only_fp.select(col('fingerprint_all').alias('id')),
                        knn_df.select(col('fp_a').alias('src'), col('fp_b').alias('dst')))
    g_knn_df = gf_knn.connectedComponents().select(col('id').alias('fingerprint_all'), 'component')
    g_knn_df.write.mode('overwrite').parquet(os.path.join('/tmp/', 'g_' + file_name))

def generate(k):
    file_name = f'{k}_knn_df.parquet'
    knn_df = generate_knndf(fp_distances_raw, k, file_name)
    knn_df = spark.read.parquet(os.path.join('/tmp/', file_name))
    generate_simple_knndf(knn_df, file_name)
    generate_g_knndf(knn_df, file_name)

# Read k-NN graph files
def get_knn(k):
    file_name = f'{k}_knn_df.parquet'
    if not os.path.exists(os.path.join('/tmp/', file_name)):
        raise ValueError()
    knn_df = spark.read.parquet(os.path.join('/tmp/', file_name))
    if not os.path.exists(os.path.join('/tmp/', 's_' + file_name)):
        raise ValueError()
    s_knn_df = spark.read.parquet(os.path.join('/tmp/', 's_' + file_name))
    if not os.path.exists(os.path.join('/tmp/', 'g_' + file_name)):  # check the graph parquet, not the s_ one again
        raise ValueError()
    g_knn_df = spark.read.parquet(os.path.join('/tmp/', 'g_' + file_name))
    component_count = g_knn_df.select('component').distinct().count()
    return knn_df, s_knn_df, g_knn_df, component_count
```
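As a rough usage sketch (not part of the shipped notebook), the functions above can be driven like this. It assumes fp_distances_raw is loaded from one of the parquet files written by the loop above, that the chosen k values are arbitrary, and that a Spark checkpoint directory is configured for connectedComponents():

```python
%spark.pyspark
# Illustrative only: build k-NN graphs for a few values of k and compare how many
# connected components (behavior clusters) result.
fp_distances_raw = spark.read.parquet('/tmp/fp_distances_raw_24_new.parquet')  # written by the loop above

for k in [1, 3, 5]:
    generate(k)                                            # writes {k}_knn_df.parquet plus the s_/g_ variants
    knn_df, s_knn_df, g_knn_df, components = get_knn(k)
    print(f'k={k}: {s_knn_df.count()} undirected edges, {components} connected components')
```

Smaller k keeps only the strongest similarities per fingerprint, so it tends to split the behaviors into more components.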
2-Analyze-FPs
Processing the parsed output:
```python
def read_parsed_scan(scan, label):
    # Load the parquet outputs written by 1-Parse-Scan
    file_dir = os.path.join(scan, "certs.parquet")
    certs = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "fingerprints.parquet")
    fps = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "http.parquet")
    http = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "tls_verbose.parquet")
    tls = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "input.parquet")
    labeled_input = spark.read.parquet(file_dir)
    file_dir = os.path.join(scan, "hosts.parquet")
    hosts = spark.read.parquet(file_dir)
    hosts = hosts.withColumn('label', f.lit(label))  # tag hosts with the scan label passed in
    hosts = hosts.fillna("empty", ['server_name'])
    return labeled_input, hosts, tls, fps, certs, http
```
The inputs come from:
bl_labeled_input, bl_hosts, bl_tls, bl_fps, bl_certs, bl_http = read_parsed_scan("/data/blocklist-parsed", "blocklist")
tl_labeled_input, tl_hosts, tl_tls, tl_fps, tl_certs, tl_http = read_parsed_scan("/data/toplist-parsed", "toplist")
Input paths:
/data/blocklist-parsed
/data/toplist-parsed
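A quick sanity check is possible at this point (illustrative, not taken from the notebook; it only uses the DataFrames returned above):

```python
# Row counts of both parsed scans and a peek at the combined fingerprints.
print('blocklist hosts:', bl_hosts.count(), ' toplist hosts:', tl_hosts.count())
bl_fps.select('fingerprint_all').distinct().show(3, truncate=40)
```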
The output for the default data:
```
Top 5 behaviors from block lists (based on number of targets):
+--------------------+-------+--------------+---------------------+--------------------+
|     fingerprint_all|targets|Distinct Ports|Distinct IP addresses| collect_set(labels)|
+--------------------+-------+--------------+---------------------+--------------------+
|771_c030_65281.-1...|     57|             4|                   57|[[Dridex], [Emotet]]|
|771_9c_65281.-35....|     51|            10|                   49|          [[QakBot]]|
|771_c02f_65281.-1...|     38|            14|                   38|          [[Dridex]]|
|771_c02f_65281.-1...|     36|            19|                   36|          [[Dridex]]|
|769_c014_65281.-1...|     22|             2|                   22|        [[TrickBot]]|
+--------------------+-------+--------------+---------------------+--------------------+

Top 5 behaviors from top lists (based on number of targets):
+--------------------+-------+----------------+---------------------+
|     fingerprint_all|targets|Distinct Domains|Distinct IP addresses|
+--------------------+-------+----------------+---------------------+
|771_1301_51.29-43...|    272|              55|                  260|
|771_1301_43.AwQ-5...|    128|              24|                  118|
|771_1301_51.29-43...|    110|              31|                  110|
|771_1301_51.29-43...|     96|              25|                   74|
|771_1301_51.29-43...|     93|              38|                   65|
+--------------------+-------+----------------+---------------------+
```
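For orientation, a table like the blocklist one above could plausibly be produced with an aggregation along these lines. This is a hedged sketch using the columns of the parsed data shown earlier (bl_labeled_input, bl_fps); the actual notebook query may differ, for example in how labels are joined:

```python
from pyspark.sql import functions as f

# Sketch: count targets, distinct ports and IPs, and collect the malware labels per behavior.
top_block = (bl_labeled_input
             .join(bl_fps, on=['ip', 'port', 'server_name'])
             .groupBy('fingerprint_all')
             .agg(f.count('*').alias('targets'),
                  f.countDistinct('port').alias('Distinct Ports'),
                  f.countDistinct('ip').alias('Distinct IP addresses'),
                  f.collect_set('label'))
             .orderBy(f.col('targets').desc()))
top_block.show(5, truncate=20)
```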
The data above comes from the first step, 1-Parse-Scan; let's look at its inputs and outputs:
```python
import logging
import os
from multiprocessing.pool import ThreadPool

from pyspark.sql import functions as f
from pyspark.sql.types import StringType

# load_csv, save_to_parquet, get_map_pandas_add_as, remove_status_request and the
# *_DF_SCHEMA / ADD_AS_SCHEMA constants are helpers defined in earlier notebook cells.

def parse(input_dir, tmp_dir, out, toplist=False):
    # Load and save all input csv files as parquets
    with ThreadPool(processes=4) as pool:
        def process_file(t):
            file = os.path.join(input_dir, t[0])
            try:
                df_tmp = load_csv(file, schema=t[1])
                save_to_parquet(out, df_tmp, t[2], 4)
            except FileNotFoundError:
                logging.info(f'Skipping {t}')
            except Exception as err:
                logging.exception('Could not save csv', exc_info=err)

        jobs = []
        r = pool.map_async(process_file, [
            ('tls_verbose.csv', TLS_VERBOSE_DF_SCHEMA, 'tls_verbose'),
            ('http.csv', HTTP_DF_SCHEMA, 'http'),
            ('certs.csv', CERTS_DF_SCHEMA, 'certs'),
            ('labeled-input.csv', INPUT_DF_SCHEMA, 'input_tmp')
        ], chunksize=1)
        jobs.append(r)

        # Enrich hosts with AS information resolved via pyasn
        hosts_csv = load_csv(os.path.join(input_dir, 'hosts.csv'), schema=HOSTS_DF_SCHEMA)
        df_ip = hosts_csv.select('ip').distinct().mapInPandas(get_map_pandas_add_as(
            os.path.join(input_dir.replace('file://', ''), '..', 'pyasn.dat'),
            os.path.join(input_dir.replace('file://', ''), '..', 'pyasn.asnames.json')), ADD_AS_SCHEMA)
        hosts_csv = hosts_csv.join(df_ip, on='ip', how='left_outer')
        r = pool.apply_async(lambda: save_to_parquet(out, hosts_csv, 'hosts', 4))
        jobs.append(r)

        for p in jobs:
            p.get()

    # Reload for faster processing
    hosts_csv = spark.read.parquet(os.path.join(out, 'hosts.parquet'))
    input_csv = spark.read.parquet(os.path.join(out, 'input_tmp.parquet'))
    tls_csv = spark.read.parquet(os.path.join(out, 'tls_verbose.parquet'))

    # Join input for scan ids
    if not toplist:
        input_csv = input_csv.join(hosts_csv.select('id', 'server_name', 'ip', 'port'),
                                   (input_csv.ip.eqNullSafe(hosts_csv.ip)) &
                                   (input_csv.server_name.eqNullSafe(hosts_csv.server_name)) &
                                   (input_csv.port.eqNullSafe(hosts_csv.port)))\
            .select(input_csv.ip, input_csv.server_name, input_csv.rank, input_csv.label,
                    input_csv.port, input_csv.list, hosts_csv.id).distinct()
    else:
        input_csv = input_csv.join(hosts_csv.select('id', 'server_name', 'ip', 'port'),
                                   (input_csv.server_name.eqNullSafe(hosts_csv.server_name)))\
            .select(hosts_csv.ip, input_csv.server_name, input_csv.rank, input_csv.label,
                    hosts_csv.port, input_csv.list, hosts_csv.id).distinct()
    save_to_parquet(out, input_csv, 'input', 4)

    splittext = f.udf(lambda FullPath: FullPath.split('.')[0], StringType())

    # Compute TLS fingerprints
    hosts_csv = hosts_csv.fillna('default', subset='client_hello').withColumn('client_hello_simple', splittext("client_hello"))
    hosts_csv = hosts_csv.drop('client_hello').withColumnRenamed('client_hello_simple', 'client_hello')
    pivot_src_df = hosts_csv.join(tls_csv, on='id').select('id', 'ip', 'port', 'server_name', 'client_hello', 'fingerprint')

    # Compute combined fingerprints
    client_hellos_escaped = sorted(pivot_src_df.select('client_hello').distinct().rdd.map(lambda r: r[0]).collect())
    fingerprint_df = pivot_src_df.groupBy('ip', 'port', 'server_name').pivot('client_hello', client_hellos_escaped).agg(f.first('fingerprint'))
    fingerprint_df = fingerprint_df.withColumn('fingerprint_all',
        f.array_join(remove_status_request(f.array(*client_hellos_escaped)), delimiter='|', null_replacement='______<255'))
    #fingerprint_df = add_fingerprint_col(fingerprint_df, 'fingerprint_all', client_hellos)

    # Save FP joined
    save_to_parquet(out, fingerprint_df, 'fingerprints', 4, partition_columns=['ip', 'port'])
```
parse("/data/blocklist", "/tmp", "/data/blocklist-parsed")
parse("/data/toplist", "/tmp", "/data/toplist-parsed", toplist=True)
So the inputs live under /data, and the outputs are exactly /data/blocklist-parsed and /data/toplist-parsed.
Below is the result after I changed blocklist-parsed to blocklist-parsed2 and ran it again:
/data# ls
blocklist blocklist-parsed2 pyasn.asnames.json toplist toplist-parsed2
blocklist-parsed blocklist.zip pyasn.dat toplist-parsed toplist.zip
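For reference, that re-run presumably boiled down to calling parse() with the new output directories (hypothetical calls matching the listing above):

```python
parse("/data/blocklist", "/tmp", "/data/blocklist-parsed2")
parse("/data/toplist", "/tmp", "/data/toplist-parsed2", toplist=True)
```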
Enter a bash shell inside the container to look at the data (the container ID 322c74ee1119 comes from docker ps):
```
sudo docker exec -it 322c74ee1119 /bin/bash
```
Let's look at what's inside blocklist:
```
/data/blocklist # ls
blocklist.input  certs.csv  http.csv          labeled-input.csv  tls_verbose.csv
cert_chain.csv   hosts.csv  http_verbose.csv  log
```
Looking at the contents of hosts, there are a bit over 50,000 records in total:
Now the contents of toplist:
```
/data/toplist # ls
alexa.input     certs.csv  http.csv          labeled-input.csv  tls_verbose.csv
cert_chain.csv  hosts.csv  http_verbose.csv  log
```
The hosts file here contains a bit over 60,000 records:
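The rough host counts mentioned here and above can be reproduced with a small sketch (assuming the hosts.csv files carry a header row and that spark is the Zeppelin-provided session):

```python
import os

# Illustrative: count the rows of hosts.csv in both scan directories.
for scan in ['/data/blocklist', '/data/toplist']:
    hosts = spark.read.csv(os.path.join(scan, 'hosts.csv'), header=True)
    print(scan, hosts.count(), 'hosts')
```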
The results of running the third part: