Big Data Product Project Notes
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
def Feat(priors: DataFrame, orders: DataFrame): DataFrame = {
  /**
   * Product features:
   *   1. sales volume: prod_cnt
   *   2. number of times the product was reordered: prod_sum_rod
   *   3. reorder rate: prod_rod_rate
   */
  // Sales volume per product
  val productCnt = priors.groupBy("product_id").count()

  // Reorder count: sum("reordered"); reorder rate: avg("reordered")
  val productRodCnt = priors.selectExpr("product_id", "cast(reordered as int)")
    .groupBy("product_id")
    .agg(sum("reordered").as("prod_sum_rod"), avg("reordered").as("prod_rod_rate"))

  val productFeat = productCnt.join(productRodCnt, "product_id")
    .selectExpr("product_id",
      "count as prod_cnt",
      "prod_sum_rod",
      "prod_rod_rate")

  /**
   * User features:
   *   1. average gap in days between a user's orders: user_avg_day_gap
   *   2. total number of orders per user
   *   3. deduplicated set of products each user has bought
   *   4. total product count per user and the deduplicated product count
   *   5. average number of products per order for each user
   */
  // Handle bad values: treat an empty days_since_prior_order as 0
  val ordersNew = orders
    .selectExpr("*", "if(days_since_prior_order='',0,days_since_prior_order) as dspo")
    .drop("days_since_prior_order")

  // 1. Average order gap per user: avg("dspo")
  val userGap = ordersNew.selectExpr("user_id", "cast(dspo as int)")
    .groupBy("user_id")
    .avg("dspo")
    .withColumnRenamed("avg(dspo)", "user_avg_day_gap")

  // 2. Total number of orders per user
  val userOrdCnt = orders.groupBy("user_id").count()

  // 3. Deduplicated set of products bought by each user
  val op = orders.join(priors, "order_id").select("user_id", "product_id")

  // Converting an RDD to a DataFrame with toDF needs the implicit conversions
  import priors.sparkSession.implicits._

  val userUniOrdRecs = op.rdd.map(x => (x(0).toString, x(1).toString))
    .groupByKey()
    .mapValues(_.toSet.mkString(","))
    .toDF("user_id", "product_records")

  // 4. Total product count per user and the deduplicated product count.
  // Steps 3 and 4 can be merged: take the deduplicated product set and its size in one pass.
  val userProRcdSize = op.rdd.map(x => (x(0).toString, x(1).toString))
    .groupByKey()
    .mapValues { record =>
      val rs = record.toSet
      (rs.size, rs.mkString(","))
    }
    .toDF("user_id", "tuple")
    .selectExpr("user_id", "tuple._1 as prod_dist_cnt", "tuple._2 as prod_records")

  // 5. Average number of products per order for each user
  // 1) number of products per order: group by order_id, count()
  val ordProCnt = priors.groupBy("order_id").count()
  // 2) average product count over a user's orders: group by user_id, avg(count)
  val userPerOrdProdCnt = orders.join(ordProCnt, "order_id")
    .groupBy("user_id")
    .avg("count")
    .withColumnRenamed("avg(count)", "user_avg_ord_prods")

  val userFeat = userGap.join(userOrdCnt, "user_id")
    .join(userProRcdSize, "user_id")
    .join(userPerOrdProdCnt, "user_id")
    .selectExpr("user_id",
      "user_avg_day_gap",
      "count as user_ord_cnt",
      "prod_dist_cnt as user_prod_dist_cnt",
      "prod_records as user_prod_records",
      "user_avg_ord_prods")

  orders.join(priors, "order_id")
    .select("user_id", "product_id", "eval_set")
    .join(productFeat, "product_id")
    .join(userFeat, "user_id")
}
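The RDD groupByKey passes in steps 3 and 4 can also stay entirely in the DataFrame API. A minimal sketch (not from the original notes), assuming the same op DataFrame of (user_id, product_id) pairs built above:

import org.apache.spark.sql.functions.{col, collect_set, concat_ws, size}

// One aggregation yields both the deduplicated product count and the
// comma-separated record string, mirroring userProRcdSize above.
val userProRcdSizeDf = op
  .groupBy("user_id")
  .agg(collect_set("product_id").as("prod_set"))
  .select(
    col("user_id"),
    size(col("prod_set")).as("prod_dist_cnt"),
    concat_ws(",", col("prod_set")).as("prod_records"))

This keeps everything in the DataFrame API and avoids the round trip through the RDD API.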
scala> val orders = spark.sql("select * from badou.orders")
orders: org.apache.spark.sql.DataFrame = [order_id: string, user_id: string ... 5 more fields]

scala> val priors = spark.sql("select * from badou.priors")
priors: org.apache.spark.sql.DataFrame = [order_id: string, product_id: string ... 2 more fields]

scala> val dfFeat = Feat(priors,orders)
dfFeat: org.apache.spark.sql.DataFrame = [user_id: string, product_id: string ... 9 more fields]
scala> dfFeat.show()
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
|user_id|product_id|eval_set|prod_cnt|prod_sum_rod|      prod_rod_rate|user_avg_day_gap|user_ord_cnt|user_prod_dist_cnt|   user_prod_records|user_avg_ord_prods|
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     26079|   prior|     406|         236| 0.5812807881773399|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|       169|   prior|    1877|        1016| 0.5412892914224827|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     33129|   prior|   12196|        6436| 0.5277140045916694|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     22285|   prior|     276|         196| 0.7101449275362319|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     14811|   prior|     249|         110|0.44176706827309237|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     31506|   prior|   50255|       23954| 0.4766490896428216|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     31506|   prior|   50255|       23954| 0.4766490896428216|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     21616|   prior|   72829|       49798| 0.6837660821925332|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     16168|   prior|    4593|        2568| 0.5591116917047682|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     44142|   prior|   43009|       23609| 0.5489316189634728|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
only showing top 20 rows
Notes:

Label data: in orders, eval_set takes the values prior / train / test, and the train orders serve as the label data. A label marks a purchase that has already happened in the user's next order: if a (user, product) pair appears in the train order, its label is 1, otherwise 0.

What is logistic regression (LR) used for here? Predicting (scoring) the probability p(l) that each other product appears in this user's next order. The end goal is to predict what a user will buy in their next order: some of that order's products are given, the rest have to be predicted. For example, given orders 1-10, predict the products of order 11; or train on orders 1-9 and predict order 10: orders 1-9 produce a model, the model predicts which products appear in order 10 (predicted product set 1), and the actual 10th order gives the actual product set 2.

Tables:
orders: the users' order data, eval_set = prior / train / test
products: product information
priors: the products in each order, about 30 million rows
trains: the products in each order, restricted to each user's last order

Features: "feature", abbreviated "feat" (hence the Feat function above)
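A minimal sketch (an assumption, not code from the original notes) of how those 0/1 labels can be attached to the feature table dfFeat, using the badou.trains table whose DDL appears below:

// dfFeat has one row per prior purchase, so deduplicate the (user_id, product_id)
// candidates first. A pair gets label 1 if the product shows up again in the
// user's train order, and 0 otherwise.
val trains = spark.sql("select * from badou.trains")

val trainLabels = orders.filter("eval_set = 'train'")   // each user's last (train) order
  .join(trains, "order_id")
  .selectExpr("user_id", "product_id", "1 as label")

val samples = dfFeat.dropDuplicates(Seq("user_id", "product_id"))
  .join(trainLabels, Seq("user_id", "product_id"), "left_outer")
  .na.fill(0, Seq("label"))                             // never bought again -> label 0

These labeled samples are what the logistic regression would be trained on.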
create table products (
    product_id string,
    product_name string,
    aisle_id string,
    department_id string
)
row format delimited fields terminated by ','
lines terminated by '\n';

-- CSV header of the source file: order_id,product_id,add_to_cart_order,reordered
create table trains (
    order_id string,
    product_id string,
    add_to_cart_order string,
    reordered string
)
row format delimited fields terminated by ','
lines terminated by '\n';
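To get the exported CSV files into these tables from the Spark shell, a hedged sketch (the paths are hypothetical; if the files still contain the header row shown above, drop it or set "skip.header.line.count"="1" on the table):

// Hypothetical locations of the CSV exports; adjust to the actual paths.
spark.sql("LOAD DATA LOCAL INPATH '/data/products.csv' OVERWRITE INTO TABLE badou.products")
spark.sql("LOAD DATA LOCAL INPATH '/data/trains.csv' OVERWRITE INTO TABLE badou.trains")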
import org.apache.spark.ml.feature.OneHotEncoder

// One-hot encode order_dow (day of week) into a vector column "dow"
val oh = new OneHotEncoder().setInputCol("order_dow").setOutputCol("dow")
val ohDf = oh.transform(orders.selectExpr("cast(order_dow as double)"))
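The encoded dow vector can then be assembled with the numeric columns produced by Feat and fed to the logistic regression mentioned in the notes. A minimal sketch under assumptions: trainDf is a hypothetical DataFrame that already carries the listed feature columns plus a label column cast to double:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression

// Combine the one-hot day-of-week vector with a few of the numeric features
// from Feat into a single "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(Array("dow", "prod_cnt", "prod_rod_rate", "user_avg_ord_prods"))
  .setOutputCol("features")

val lr = new LogisticRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")

// Fit on the assembled training samples; lrModel.transform(...) then scores
// candidate (user, product) pairs for the next order.
val lrModel = lr.fit(assembler.transform(trainDf))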
########## Today's grind is so we won't keep grinding like this forever! ##########