I recently tried TensorFlow's wide & deep model on a Kaggle CTR (click-through) prediction competition. The result was decent; the final score should land roughly in the top 100, although the competition itself ended about four years ago. This post mainly shares the approach and the code, which is adapted from the census model code in the official wide & deep example.

The first step keeps the original features and generates new ones on top of them. I won't list every new feature individually; the code below spells them out.

#coding:utf-8

import collections, csv, sys, pickle, os, time
import numpy as np

_CSV_COLUMNS = ['id', 'click', 'hour', 'C1', 'banner_pos',
                'site_id', 'site_domain', 'site_category',
                'app_id', 'app_domain', 'app_category',
                'device_id', 'device_ip', 'device_model',
                'device_type', 'device_conn_type',
                'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

NEW_CSV_COLUMNS = _CSV_COLUMNS + ['device_id_cnt', 'device_ip_cnt',
                                  'usr_cnt', 'usr_hour_cnt', 'new_usr',
                                  'usr_day_count', 'usr_day_count_unique', 'usr_hour_adid_count',
                                  'usr_hour_adgroup_count',
                                  'usr_day_adid_count', 'usr_day_adgroup_count',
                                  'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',
                                  ]

NEW_CSV_COLUMNS_hour = NEW_CSV_COLUMNS + ['mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize',
                                          # 'device_id_2', 'device_ip_2'
                                          ]

_CSV_COLUMNS_test = ['id', 'hour', 'C1', 'banner_pos',
                     'site_id', 'site_domain', 'site_category',
                     'app_id', 'app_domain', 'app_category',
                     'device_id', 'device_ip', 'device_model',
                     'device_type', 'device_conn_type',
                     'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

NEW_CSV_COLUMNS_test = _CSV_COLUMNS_test + ['device_id_cnt', 'device_ip_cnt',
                                            'usr_cnt', 'usr_hour_cnt', 'new_usr',
                                            'usr_day_count', 'usr_day_count_unique', 'usr_hour_adid_count',
                                            'usr_hour_adgroup_count',
                                            'usr_day_adid_count', 'usr_day_adgroup_count',
                                            'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',
                                            ]

NEW_CSV_COLUMNS_test_hour = NEW_CSV_COLUMNS_test + ['mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize',
                                                    # 'device_id_2', 'device_ip_2'
                                                    ]

# Handle the placeholder device id: 'a99f214a' marks a missing device id,
# so fall back to device_ip + device_model as the user key.
def def_usr(row):
    if row['device_id'] == 'a99f214a':
        usr = 'ip-' + row['device_ip'] + row['device_model']
    else:
        # print(row['device_id'])
        usr = 'id-' + row['device_id']

    return usr

def scan(path):
    for i, row in enumerate(csv.DictReader(open(path)), start=1):

        usr = def_usr(row)

        id_count[row['device_id']] += 1
        ip_count[row['device_ip']] += 1
        usr_count[usr] += 1
        usr_hour_count[usr + '-' + row['hour']] += 1  # number of impressions to the user in the hour

        # part 3, added 2019-04-10
        usr_day_count[usr + '-' + row['hour'][0:6]] += 1  # number of impressions to the user in the day
        usr_day_count_unique[usr].add(row['hour'][0:6])   # number of distinct days the user appeared

        usr_hour_adid_count[usr + '-' + row['C14'] + '-' + row['hour']] += 1         # impressions to the user for the ad id in the hour
        usr_hour_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour']] += 1      # impressions to the user for the ad group in the hour
        usr_day_adid_count[usr + '-' + row['C14'] + '-' + row['hour'][0:6]] += 1     # impressions to the user for the ad id in the day
        usr_day_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour'][0:6]] += 1  # impressions to the user for the ad group in the day

        usr_day_appid_count[usr + '-' + row['app_id'] + '-' + row['hour'][0:6]] += 1    # impressions to the user for the app id in the day
        usr_day_siteid_count[usr + '-' + row['site_id'] + '-' + row['hour'][0:6]] += 1  # impressions to the user for the site id in the day

        hour0 = hour_record.get(usr, '01010101')
        hour0 = time.strptime(hour0, '%y%m%d%H')
        hour1 = time.strptime(row['hour'], '%y%m%d%H')
        usr_last_visit_diff[usr + '-' + row['hour']] = 24 * (hour1.tm_yday - hour0.tm_yday) + (hour1.tm_hour - hour0.tm_hour)  # hours since the user's previous visit
        hour_record.update({usr: row['hour']})

        if i % 100000 == 0:
            print(i, row)
            print(hour0, hour1, usr_last_visit_diff[usr + '-' + row['hour']])

def gen_data_3rd(src_path, dst_path, train=True):
    reader = csv.DictReader(open(src_path))
    if train:
        # writer = csv.DictWriter(open(dst_path, 'w', newline=''), NEW_CSV_COLUMNS_hour)
        writer = csv.DictWriter(open(dst_path, 'w'), NEW_CSV_COLUMNS_hour)
    else:
        # writer = csv.DictWriter(open(dst_path, 'w', newline=''), NEW_CSV_COLUMNS_test_hour)
        writer = csv.DictWriter(open(dst_path, 'w'), NEW_CSV_COLUMNS_test_hour)
    writer.writeheader()

    for i, row in enumerate(reader, start=1):
        if i % 100000 == 0:
            print(i, row)

        new_row = {}

        if train:
            for field in _CSV_COLUMNS:
                new_row[field] = row[field]
        else:
            for field in _CSV_COLUMNS_test:
                new_row[field] = row[field]

        # part 1 features
        usr = def_usr(row)
        new_row['device_id_cnt'] = id_count[row['device_id']]
        new_row['device_ip_cnt'] = ip_count[row['device_ip']]
        new_row['usr_cnt'] = usr_count[usr]
        new_row['usr_hour_cnt'] = usr_hour_count[usr + '-' + row['hour']]  # note: the key must be built exactly as in scan()

        new_row['new_usr'] = usr

        # part 2 features
        hour = time.strptime(new_row['hour'], '%y%m%d%H')
        new_row['mday'] = hour.tm_mday
        new_row['wday'] = hour.tm_wday
        new_row['tmhour'] = hour.tm_hour
        new_row['weekend'] = int(hour.tm_wday == 5 or hour.tm_wday == 6)

        new_row['night'] = int(hour.tm_hour in [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8])  # tm_hour is 0-23; 0 is midnight
        new_row['imgsize'] = new_row['C15'] + '*' + new_row['C16']
        # new_row['device_id_2'] = new_row['device_id'][0:4]
        # new_row['device_ip_2'] = new_row['device_ip'][0:4]

        # part 3 features
        new_row['usr_day_count'] = usr_day_count[usr + '-' + row['hour'][0:6]]
        new_row['usr_day_count_unique'] = len(usr_day_count_unique[usr])
        new_row['usr_hour_adid_count'] = usr_hour_adid_count[usr + '-' + row['C14'] + '-' + row['hour']]
        new_row['usr_hour_adgroup_count'] = usr_hour_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour']]
        new_row['usr_day_adid_count'] = usr_day_adid_count[usr + '-' + row['C14'] + '-' + row['hour'][0:6]]
        new_row['usr_day_adgroup_count'] = usr_day_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour'][0:6]]
        new_row['usr_day_appid_count'] = usr_day_appid_count[usr + '-' + row['app_id'] + '-' + row['hour'][0:6]]
        new_row['usr_day_siteid_count'] = usr_day_siteid_count[usr + '-' + row['site_id'] + '-' + row['hour'][0:6]]
        new_row['usr_last_visit_diff'] = usr_last_visit_diff[usr + '-' + row['hour']]

        if i % 100000 == 0:
            print('new', i, new_row)
        writer.writerow(new_row)

# Split the data into train/validation files (dropping the header line).
def newdata(path):
    trainf = open(os.path.join(base_path, 'ctr_train4'), 'w')
    valf = open(os.path.join(base_path, 'ctr_val4'), 'w')
    for t, line in enumerate(open(path)):
        temp = np.random.rand(1)
        if temp < 0.01 and t > 0:
            valf.write(line)
        elif t > 0:
            trainf.write(line)
        else:
            print(t, line)

        if t % 100000 == 0:
            print(t, line)
    print(t)
    trainf.close()
    valf.close()

# Remove the header from the test file.
def newdatatest(path):
    testf = open(os.path.join(base_path, 'ctr_test4'), 'w')
    for t, line in enumerate(open(path)):
        if t > 0:
            testf.write(line)
            # valf.write(line)
        else:
            print(t, line)

        if t % 100000 == 0:
            print(t, line)
    print(t)
    testf.close()

if __name__ == '__main__':

    base_path = r'/home/lon/zl/ctr/temp'
    train = os.path.join(base_path, 'train.csv')              # path to the raw training file
    new_train = os.path.join(base_path, 'new_train_3rd.csv')  # path to the training file with the new features
    test = os.path.join(base_path, 'test.csv')                # path to the raw test file
    new_test = os.path.join(base_path, 'new_test_3rd.csv')    # path to the test file with the new features

    id_count = collections.defaultdict(int)
    ip_count = collections.defaultdict(int)
    usr_count = collections.defaultdict(int)
    usr_hour_count = collections.defaultdict(int)
    usr_day_count = collections.defaultdict(int)
    usr_hour_adid_count = collections.defaultdict(int)
    usr_hour_adgroup_count = collections.defaultdict(int)
    usr_day_adid_count = collections.defaultdict(int)
    usr_day_adgroup_count = collections.defaultdict(int)
    usr_day_count_unique = collections.defaultdict(set)
    usr_day_appid_count = collections.defaultdict(int)
    usr_day_siteid_count = collections.defaultdict(int)
    usr_last_visit_diff = collections.defaultdict(int)
    hour_record = {}

    scan(train)
    gen_data_3rd(train, new_train)

    # reset all counters before processing the test file
    id_count = collections.defaultdict(int)
    ip_count = collections.defaultdict(int)
    usr_count = collections.defaultdict(int)
    usr_hour_count = collections.defaultdict(int)
    usr_day_count = collections.defaultdict(int)
    usr_hour_adid_count = collections.defaultdict(int)
    usr_hour_adgroup_count = collections.defaultdict(int)
    usr_day_adid_count = collections.defaultdict(int)
    usr_day_adgroup_count = collections.defaultdict(int)
    usr_day_count_unique = collections.defaultdict(set)
    usr_day_appid_count = collections.defaultdict(int)
    usr_day_siteid_count = collections.defaultdict(int)
    usr_last_visit_diff = collections.defaultdict(int)
    hour_record = {}
    scan(test)
    gen_data_3rd(test, new_test, train=False)

    # Split the feature-augmented training data into ctr_train4 / ctr_val4 and strip the headers;
    # the model code below reads these files (and ctr_test4) with the engineered columns.
    newdata(new_train)
    newdatatest(new_test)

The code that builds the TensorFlow feature columns for the model:

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download and clean the Census Income Dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, copy
# pylint: disable=wrong-import-order
from absl import app as absl_app
from absl import flags
from six.moves import urllib
import tensorflow as tf
# pylint: enable=wrong-import-order

from official.utils.flags import core as flags_core

def _download_and_clean_file(filename, url):
    """Downloads data from url, and makes changes to match the CSV format."""
    temp_file, _ = urllib.request.urlretrieve(url)
    with tf.gfile.Open(temp_file, 'r') as temp_eval_file:
        with tf.gfile.Open(filename, 'w') as eval_file:
            for line in temp_eval_file:
                line = line.strip()
                line = line.replace(', ', ',')
                if not line or ',' not in line:
                    continue
                if line[-1] == '.':
                    line = line[:-1]
                line += '\n'
                eval_file.write(line)
    tf.gfile.Remove(temp_file)

def download(data_dir):
    """Download census data if it is not already present."""
    tf.gfile.MakeDirs(data_dir)

    training_file_path = os.path.join(data_dir, TRAINING_FILE)
    if not tf.gfile.Exists(training_file_path):
        _download_and_clean_file(training_file_path, TRAINING_URL)

    eval_file_path = os.path.join(data_dir, EVAL_FILE)
    if not tf.gfile.Exists(eval_file_path):
        _download_and_clean_file(eval_file_path, EVAL_URL)

DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult'
TRAINING_FILE = 'ctr_train4'
TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE)
EVAL_FILE = 'ctr_val4'
EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE)
TEST_FILE = 'ctr_test4'

_CSV_COLUMNS_3rd = ['id', 'click', 'hour', 'C1', 'banner_pos',
                    'site_id', 'site_domain', 'site_category',
                    'app_id', 'app_domain', 'app_category',
                    'device_id', 'device_ip', 'device_model',
                    'device_type', 'device_conn_type',
                    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                    'device_id_cnt', 'device_ip_cnt', 'usr_cnt', 'usr_hour_cnt',

                    'new_usr',
                    'usr_day_count', 'usr_day_count_unique', 'usr_hour_adid_count',
                    'usr_hour_adgroup_count',
                    'usr_day_adid_count', 'usr_day_adgroup_count',
                    'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',

                    # must match the columns actually written by the feature-generation script
                    # (device_id_2 / device_ip_2 are only present if enabled there)
                    'mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize', 'device_id_2', 'device_ip_2'
                    ]
_CSV_COLUMNS_3rd_test = copy.copy(_CSV_COLUMNS_3rd)
_CSV_COLUMNS_3rd_test.remove('click')

# One default per column, in the same order as _CSV_COLUMNS_3rd
# ([''] for string columns, [0] for integer columns).
_CSV_COLUMN_DEFAULTS_3rd = [
    [''], [0], [0], [0], [0], [''],
    [''], [''], [''], [''],
    [''], [''], [''], [''], [0], [0],
    [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0],
    [''], [0], [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0], [0], [''], [''], [''],
]
_CSV_COLUMN_DEFAULTS_3rd_test = [
    [''], [0], [0], [0], [''],
    [''], [''], [''], [''],
    [''], [''], [''], [''], [0], [0],
    [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0],
    [''], [0], [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0], [0], [''], [''], [''],
]
# 40428968 = 40428967 data rows + 1 header
_NUM_EXAMPLES = {
    'train': 40000000,
    'validation': 428967,
    'test': 4577464,
}

_HASH_BUCKET_SIZE = 100  # 1048576
_HASH_BUCKET_SIZE_LARGE = 1000000  # 000 # 1048576

def input_fn_3rd(data_file, num_epochs, shuffle, batch_size, test=False):
    """Generate an input function for the Estimator."""
    assert tf.gfile.Exists(data_file), (
        '%s not found. Please make sure you have run census_dataset.py and '
        'set the --data_dir argument to the correct path.' % data_file)

    print('data_file is:', data_file, test)

    def parse_csv(value):
        tf.logging.info('Parsing {}'.format(data_file))
        # columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        if test:
            columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS_3rd_test)
            features = dict(zip(_CSV_COLUMNS_3rd_test, columns))
            return features
        else:
            columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS_3rd)
            features = dict(zip(_CSV_COLUMNS_3rd, columns))
            # labels = features.pop('income_bracket')
            # classes = tf.equal(labels, '>50K')  # binary classification
            classes = features.pop('click')
            return features, classes

    # Extract lines from input files using the Dataset API.
    # dataset = tf.contrib.data.TextLineDataset(data_file)
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=5)
    # dataset = dataset.map(parse_csv)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

def build_model_columns_7():
    """Builds a set of wide and deep feature columns.

    # adid click hour c1 banner_pos site_id
    # 10138062202267342206, 0, 14102100, 1005, 0, 6c5b482c,
    # site_domain site_category app_id app_domain
    # 7687a86e, 3e814130, ecad2386, 7801e8d9,
    # app_category device_id device_ip device_model device_type device_conn_type
    # 07d7df22, a99f214a, 9bc437f6, ff2a3543, 1, 0,
    # c14-c21
    # 19015,300,250,2162,2,39,100217,33

    (0, id,click,hour,C1,banner_pos,site_id,site_domain,site_category,
     app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,
     C14,C15,C16,C17,C18,C19,C20,C21,
     device_id_cnt,device_ip_cnt,usr_cnt,usr_hour_cnt)

    Adds large-scale cross features.
    Adds the 3rd-stage features; numeric count features are bucketized with a step of 10.
    """

    # id = tf.feature_column.categorical_column_with_hash_bucket('id', hash_bucket_size=10000)
    # click = tf.feature_column.numeric_column('click')
    hour = tf.feature_column.categorical_column_with_hash_bucket('hour', hash_bucket_size=300, dtype=tf.int64)

    C1 = tf.feature_column.categorical_column_with_hash_bucket('C1', hash_bucket_size=20, dtype=tf.int64)
    banner_pos = tf.feature_column.categorical_column_with_hash_bucket('banner_pos', hash_bucket_size=20, dtype=tf.int64)
    site_id = tf.feature_column.categorical_column_with_hash_bucket('site_id', hash_bucket_size=7000)
    site_domain = tf.feature_column.categorical_column_with_hash_bucket('site_domain', hash_bucket_size=12000)
    site_category = tf.feature_column.categorical_column_with_hash_bucket('site_category', hash_bucket_size=50)
    app_id = tf.feature_column.categorical_column_with_hash_bucket('app_id', hash_bucket_size=12000)
    app_domain = tf.feature_column.categorical_column_with_hash_bucket('app_domain', hash_bucket_size=900)
    app_category = tf.feature_column.categorical_column_with_hash_bucket('app_category', hash_bucket_size=100)
    device_id = tf.feature_column.categorical_column_with_hash_bucket('device_id', hash_bucket_size=3000000)
    device_ip = tf.feature_column.categorical_column_with_hash_bucket('device_ip', hash_bucket_size=8000000)
    device_model = tf.feature_column.categorical_column_with_hash_bucket('device_model', hash_bucket_size=15000)
    device_type = tf.feature_column.categorical_column_with_hash_bucket('device_type', hash_bucket_size=20, dtype=tf.int64)
    device_conn_type = tf.feature_column.categorical_column_with_hash_bucket('device_conn_type', hash_bucket_size=20, dtype=tf.int64)

    C14 = tf.feature_column.categorical_column_with_hash_bucket('C14', hash_bucket_size=5000, dtype=tf.int64)
    C15 = tf.feature_column.categorical_column_with_hash_bucket('C15', hash_bucket_size=30, dtype=tf.int64)
    C16 = tf.feature_column.categorical_column_with_hash_bucket('C16', hash_bucket_size=30, dtype=tf.int64)
    C17 = tf.feature_column.categorical_column_with_hash_bucket('C17', hash_bucket_size=1000, dtype=tf.int64)
    C18 = tf.feature_column.categorical_column_with_hash_bucket('C18', hash_bucket_size=30, dtype=tf.int64)
    C19 = tf.feature_column.categorical_column_with_hash_bucket('C19', hash_bucket_size=200, dtype=tf.int64)
    C20 = tf.feature_column.categorical_column_with_hash_bucket('C20', hash_bucket_size=500, dtype=tf.int64)
    C21 = tf.feature_column.categorical_column_with_hash_bucket('C21', hash_bucket_size=200, dtype=tf.int64)

    # Count features: bucketize them instead of feeding the raw numeric values to the model.
    device_id_cnt_0 = tf.feature_column.numeric_column('device_id_cnt', dtype=tf.int64)
    device_ip_cnt_0 = tf.feature_column.numeric_column('device_ip_cnt', dtype=tf.int64)
    usr_cnt_0 = tf.feature_column.numeric_column('usr_cnt', dtype=tf.int64)
    usr_hour_cnt_0 = tf.feature_column.numeric_column('usr_hour_cnt', dtype=tf.int64)

    device_id_cnt = tf.feature_column.bucketized_column(device_id_cnt_0, boundaries=list(range(0, 10000, 10)))
    device_ip_cnt = tf.feature_column.bucketized_column(device_ip_cnt_0, boundaries=list(range(0, 10000, 10)))
    usr_cnt = tf.feature_column.bucketized_column(usr_cnt_0, boundaries=list(range(0, 10000, 10)))
    usr_hour_cnt = tf.feature_column.bucketized_column(usr_hour_cnt_0, boundaries=list(range(0, 10000, 10)))

    mday = tf.feature_column.categorical_column_with_hash_bucket('mday', hash_bucket_size=31, dtype=tf.int64)
    wday = tf.feature_column.categorical_column_with_hash_bucket('wday', hash_bucket_size=7, dtype=tf.int64)
    weekend = tf.feature_column.categorical_column_with_hash_bucket('weekend', hash_bucket_size=2, dtype=tf.int64)
    tmhour = tf.feature_column.categorical_column_with_hash_bucket('tmhour', hash_bucket_size=24, dtype=tf.int64)
    night = tf.feature_column.categorical_column_with_hash_bucket('night', hash_bucket_size=2, dtype=tf.int64)
    imgsize = tf.feature_column.categorical_column_with_hash_bucket('imgsize', hash_bucket_size=100)

    new_usr = tf.feature_column.categorical_column_with_hash_bucket('new_usr', hash_bucket_size=8000000)

    usr_day_count_0 = tf.feature_column.numeric_column('usr_day_count', dtype=tf.int64)
    usr_day_count_unique_0 = tf.feature_column.numeric_column('usr_day_count_unique', dtype=tf.int64)
    usr_hour_adid_count_0 = tf.feature_column.numeric_column('usr_hour_adid_count', dtype=tf.int64)
    usr_hour_adgroup_count_0 = tf.feature_column.numeric_column('usr_hour_adgroup_count', dtype=tf.int64)
    usr_day_adid_count_0 = tf.feature_column.numeric_column('usr_day_adid_count', dtype=tf.int64)
    usr_day_adgroup_count_0 = tf.feature_column.numeric_column('usr_day_adgroup_count', dtype=tf.int64)
    usr_day_appid_count_0 = tf.feature_column.numeric_column('usr_day_appid_count', dtype=tf.int64)
    usr_day_siteid_count_0 = tf.feature_column.numeric_column('usr_day_siteid_count', dtype=tf.int64)
    usr_last_visit_diff_0 = tf.feature_column.numeric_column('usr_last_visit_diff', dtype=tf.int64)

    usr_day_count = tf.feature_column.bucketized_column(usr_day_count_0, boundaries=list(range(0, 10000, 10)))
    usr_day_count_unique = tf.feature_column.bucketized_column(usr_day_count_unique_0, boundaries=list(range(0, 10000, 10)))
    usr_hour_adid_count = tf.feature_column.bucketized_column(usr_hour_adid_count_0, boundaries=list(range(0, 10000, 10)))
    usr_hour_adgroup_count = tf.feature_column.bucketized_column(usr_hour_adgroup_count_0, boundaries=list(range(0, 10000, 10)))
    usr_day_adid_count = tf.feature_column.bucketized_column(usr_day_adid_count_0, boundaries=list(range(0, 10000, 10)))
    usr_day_adgroup_count = tf.feature_column.bucketized_column(usr_day_adgroup_count_0, boundaries=list(range(0, 10000, 10)))
    usr_day_appid_count = tf.feature_column.bucketized_column(usr_day_appid_count_0, boundaries=list(range(0, 10000, 10)))
    usr_day_siteid_count = tf.feature_column.bucketized_column(usr_day_siteid_count_0, boundaries=list(range(0, 10000, 10)))
    usr_last_visit_diff = tf.feature_column.bucketized_column(usr_last_visit_diff_0, boundaries=list(range(0, 10000, 10)))

    # Wide columns and deep columns.
    base_columns = [
        hour, C1, banner_pos, site_id, site_domain, site_category, app_id,
        app_domain, app_category, device_id, device_ip, device_model, device_type,
        device_conn_type, C14, C15, C16, C17, C18, C19, C20, C21,
        # note: these are the bucketized count columns; adding the raw numeric columns here
        # makes the loss jump up and down -- FTRL does not handle numeric features well
        device_id_cnt, device_ip_cnt, usr_cnt, usr_hour_cnt,
        mday, wday, weekend, tmhour,
        night, imgsize,
        new_usr,
        usr_day_count, usr_day_count_unique,
        usr_hour_adid_count,
        usr_hour_adgroup_count,
        usr_day_adid_count, usr_day_adgroup_count,
        usr_day_appid_count, usr_day_siteid_count, usr_last_visit_diff,
    ]

    # Second-order cross features between (almost) all columns.
    CSV_COLUMNS = copy.copy(_CSV_COLUMNS_3rd_test)
    CSV_COLUMNS.remove('id')
    # CSV_COLUMNS.remove('device_id_2')
    # CSV_COLUMNS.remove('device_ip_2')
    L = len(CSV_COLUMNS)
    x = sorted(CSV_COLUMNS)

    # this crossing step greatly increases running time ...
    two_rd_col = []
    n = 0
    for i in range(L):
        for j in range(i + 1, L):
            # if x[i] in ['app_id', 'site_id'] or x[j] in ['app_id', 'site_id']:
            n += 1
            print(n, [x[i], x[j]])  # 595 feature 561
            two_rd_col.append(tf.feature_column.crossed_column([x[i], x[j]], hash_bucket_size=10000))

    # Third-order cross features (too slow in practice):
    # three_rd_col = []
    # n = 0
    # for i in range(L):
    #     for j in range(i + 1, L):
    #         for k in range(j + 1, L):
    #             n += 1
    #             print(n, i, j, k, [x[i], x[j], x[k]])  # 6545 feature 5984
    #             three_rd_col.append(tf.feature_column.crossed_column([x[i], x[j], x[k]], hash_bucket_size=100))

    # Hand-picked crosses kept for reference:
    # tf.feature_column.crossed_column([device_type, banner_pos], hash_bucket_size=1000)
    # tf.feature_column.crossed_column([device_conn_type, banner_pos, site_category], hash_bucket_size=_HASH_BUCKET_SIZE)
    # tf.feature_column.crossed_column([device_id, site_id], hash_bucket_size=100000)

    wide_columns = base_columns + two_rd_col  # + three_rd_col

    deep_columns = [
        tf.feature_column.embedding_column(hour, dimension=2),
        # tf.feature_column.indicator_column(C1),
        tf.feature_column.embedding_column(C1, dimension=1),
        tf.feature_column.embedding_column(banner_pos, dimension=1),
        tf.feature_column.embedding_column(site_id, dimension=6),
        tf.feature_column.embedding_column(site_domain, dimension=6),
        tf.feature_column.embedding_column(site_category, dimension=2),
        tf.feature_column.embedding_column(app_id, dimension=8),
        tf.feature_column.embedding_column(app_domain, dimension=6),
        tf.feature_column.embedding_column(app_category, dimension=2),
        tf.feature_column.embedding_column(device_id, dimension=40),
        tf.feature_column.embedding_column(device_ip, dimension=50),
        tf.feature_column.embedding_column(device_model, dimension=8),
        # tf.feature_column.indicator_column(device_type),
        tf.feature_column.embedding_column(device_type, dimension=1),
        tf.feature_column.embedding_column(device_conn_type, dimension=1),
        tf.feature_column.embedding_column(C14, dimension=4),
        tf.feature_column.embedding_column(C15, dimension=1),
        tf.feature_column.embedding_column(C16, dimension=1),
        tf.feature_column.embedding_column(C17, dimension=4),
        tf.feature_column.embedding_column(C18, dimension=1),
        tf.feature_column.embedding_column(C19, dimension=2),
        tf.feature_column.embedding_column(C20, dimension=2),
        tf.feature_column.embedding_column(C21, dimension=2),
        tf.feature_column.embedding_column(device_id_cnt, dimension=2),
        tf.feature_column.embedding_column(device_ip_cnt, dimension=2),
        tf.feature_column.embedding_column(usr_cnt, dimension=2),
        tf.feature_column.embedding_column(usr_hour_cnt, dimension=2),
        tf.feature_column.embedding_column(mday, dimension=1),
        tf.feature_column.embedding_column(wday, dimension=1),
        tf.feature_column.embedding_column(weekend, dimension=1),
        tf.feature_column.embedding_column(tmhour, dimension=1),
        tf.feature_column.embedding_column(night, dimension=1),
        tf.feature_column.embedding_column(imgsize, dimension=1),
        tf.feature_column.embedding_column(new_usr, dimension=40),
        tf.feature_column.embedding_column(usr_day_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_count_unique, dimension=2),
        tf.feature_column.embedding_column(usr_hour_adid_count, dimension=2),
        tf.feature_column.embedding_column(usr_hour_adgroup_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_adid_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_adgroup_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_appid_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_siteid_count, dimension=2),
        tf.feature_column.embedding_column(usr_last_visit_diff, dimension=2),
    ]
    # embeddings for the 2nd-order cross features
    deep_columns_2rd = [tf.feature_column.embedding_column(ff, dimension=4) for ff in two_rd_col]
    # deep_columns_3rd = [tf.feature_column.embedding_column(ff, dimension=1) for ff in three_rd_col]

    return wide_columns, deep_columns + deep_columns_2rd  # + deep_columns_3rd

def define_data_download_flags():
    """Add flags specifying data download arguments."""
    flags.DEFINE_string(
        name="data_dir", default=r'/home/lon/zl/ctr/temp',
        help=flags_core.help_wrap(
            "Directory to download and extract data."))

def main(_):
    download(flags.FLAGS.data_dir)

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    define_data_download_flags()
    absl_app.run(main)

Finally, the training and prediction script:

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train DNN on census income dataset."""

import os, sys

from absl import app as absl_app
from absl import flags
import tensorflow as tf

# sys.path.append(r'C:\Users\lon\Documents\slowlon\TFdemo\my_program\models-master')
# sys.path.append(r'/workplace/zb/zl/ctr/models-master')
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import census_dataset
from official.wide_deep import wide_deep_run_loop

def define_census_flags():
    wide_deep_run_loop.define_wide_deep_flags()
    flags.adopt_module_key_flags(wide_deep_run_loop)
    flags_core.set_defaults(data_dir=r'/home/lon/zl/ctr/temp',
                            model_dir=r'/home/lon/zl/ctr/temp/model6-32rd',
                            # model2 is the 3rd-order cross model, very slow (~300 s / 100 steps);
                            # model2rd is the 2nd-order cross model (~17 s / 100 steps)
                            # export_dir=r'/home/lon/zl/ctr/temp/model1024-add12f',
                            train_epochs=4,
                            epochs_between_evals=1,
                            inter_op_parallelism_threads=0,
                            intra_op_parallelism_threads=0,
                            batch_size=56)

def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
    """Build an estimator appropriate for the given model type."""
    wide_columns, deep_columns = model_column_fn()
    # hidden_units = [100, 75, 50, 25]
    # hidden_units = [256, 128, 64]
    # hidden_units = [75, 50, 25]
    # hidden_units = [24, 24, 12]
    hidden_units = [1024, 512, 256]
    # hidden_units = [1024, 512, 256, 128]

    # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
    # trains faster than GPU for this model.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            # device_count={'GPU': 0},
            inter_op_parallelism_threads=inter_op,
            intra_op_parallelism_threads=intra_op))

    if model_type == 'wide':
        return tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns,
            config=run_config)
    elif model_type == 'deep':
        return tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=hidden_units,
            config=run_config)
    else:
        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=hidden_units,
            config=run_config)

def run_census(flags_obj):
    """Construct all necessary functions and call run_loop.

    Args:
        flags_obj: Object containing user specified flags.
    """
    if flags_obj.download_if_missing:
        census_dataset.download(flags_obj.data_dir)

    train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE)
    test_file = os.path.join(flags_obj.data_dir, census_dataset.EVAL_FILE)

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    def train_input_fn():
        return census_dataset.input_fn_3rd(
            train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

    def eval_input_fn():
        return census_dataset.input_fn_3rd(test_file, 1, False, flags_obj.batch_size)

    tensors_to_log = {
        'average_loss': '{loss_prefix}head/truediv',
        'loss': '{loss_prefix}head/weighted_loss/Sum'
    }

    wide_deep_run_loop.run_loop(
        name="ctr train", train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        model_column_fn=census_dataset.build_model_columns_7,
        build_estimator_fn=build_estimator,
        flags_obj=flags_obj,
        tensors_to_log=tensors_to_log,
        early_stop=True)

# Merge the test-set ids with the predicted click probabilities into a submission file.
def combine(path):
    # trainf = open(os.path.join(base_path, 'ctr_train'), 'w')
    # valf = open(os.path.join(base_path, 'ctr_val'), 'w')
    testf = open(os.path.join(r'/home/lon/zl/ctr/official/wide_deep',
                              'submission_1.csv'), 'r')
    outfile = open(os.path.join(r'/home/lon/zl/ctr/temp_result',
                                'submission_wd_model6-32rd.csv'), 'w')
    outfile.write('id,click\n')
    print('here!')
    for t, line in enumerate(open(path)):
        line1 = testf.readline()
        if t > 0:
            outfile.write('%s,%s\n' % (str(line.rstrip().split(',')[0]),
                                       str(line1.rstrip().split(',')[1])))

        if t % 100000 == 0:
            print('T', t)
            print(t)
            print(line.rstrip().split(',')[0], line1.rstrip().split(',')[1])

    # trainf.close()
    # valf.close()
    testf.close()
    outfile.close()

test = os.path.join(r'/home/lon/zl/ctr/temp', 'test.csv')  # path to the raw test file (the ids come from here)

def run_test(flags_obj):
    """Construct the test input function and call predict_loop.

    Args:
        flags_obj: Object containing user specified flags.
    """
    # if flags_obj.download_if_missing:
    #     census_dataset.download(flags_obj.data_dir)

    # train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE)
    test_file = os.path.join(flags_obj.data_dir, census_dataset.TEST_FILE)

    # def train_input_fn():
    #     return census_dataset.input_fn(
    #         train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

    def test_input_fn():
        return census_dataset.input_fn_3rd(test_file, 1, False, 600, test=True)

    tensors_to_log = {
        'average_loss': '{loss_prefix}head/truediv',
        'loss': '{loss_prefix}head/weighted_loss/Sum'
    }

    wide_deep_run_loop.predict_loop(
        name="ctr test",
        test_input_fn=test_input_fn,
        model_column_fn=census_dataset.build_model_columns_7,
        build_estimator_fn=build_estimator,
        flags_obj=flags_obj,
        tensors_to_log=tensors_to_log
    )

    combine(test)

def main(_):
    with logger.benchmark_context(flags.FLAGS):
        run_census(flags.FLAGS)

def main_test(_):
    with logger.benchmark_context(flags.FLAGS):
        run_test(flags.FLAGS)

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    define_census_flags()
    # absl_app.run() exits the process when it finishes, so only one of these runs per
    # invocation: use main for training, main_test for prediction / submission generation.
    absl_app.run(main)
    # absl_app.run(main_test)

A few lessons learned:

  • You can add second-order cross features between all of the variables to the feature columns. This gives a solid boost to the model, but it also greatly increases training time, so in the end I only did full pairwise crossing on a subset of the variables (see the sketch after this list).
  • Among the newly generated features, the time-based ones added in the third stage (the "part 3" features) were the most useful and improved the model the most.
  • For categorical_column_with_hash_bucket, the bucket size needs to be large enough to cover all of the distinct categories in both the training and test sets.
  • The embedding dimensions do not need to be large.
  • Using numeric_column directly made the model much worse; I ended up converting every numeric count into a categorical feature by bucketizing it.
  • Indicator columns did not seem to help much; embeddings worked better.
  • Stopping training too early does not give good results.
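
To make the bucketizing and crossing points concrete, here is a minimal standalone sketch, not taken from the competition code: the column names (usr_cnt, device_ip, ...) mirror the post but the subset of crossed columns, the boundaries, and the sizes are only illustrative. It shows the pattern used above: bucketize a count instead of passing it through numeric_column directly, give a high-cardinality categorical a hash bucket size that covers its vocabulary, and build a small set of pairwise crossed columns for the wide part.

import tensorflow as tf
from itertools import combinations

# Illustrative columns only (names mirror the post, sizes are made up).
usr_cnt_raw = tf.feature_column.numeric_column('usr_cnt', dtype=tf.int64)

# 1) Bucketize the count: the wide (FTRL) part behaves much better with the
#    bucketized categorical version than with the raw numeric value.
usr_cnt = tf.feature_column.bucketized_column(usr_cnt_raw,
                                              boundaries=list(range(0, 10000, 10)))

# 2) The hash bucket size should cover the number of distinct values seen in
#    train + test, otherwise unrelated values collide into the same bucket.
device_ip = tf.feature_column.categorical_column_with_hash_bucket(
    'device_ip', hash_bucket_size=8000000)

# 3) Pairwise (second-order) crosses for the wide part; crossing by column name
#    uses the raw input values. The number of pairs grows quadratically, and so
#    does training time, so only cross a chosen subset of columns.
cross_base = ['banner_pos', 'site_category', 'app_category', 'device_conn_type']
wide_crosses = [tf.feature_column.crossed_column(list(pair), hash_bucket_size=10000)
                for pair in combinations(sorted(cross_base), 2)]

# Wide part: sparse columns plus crosses; deep part: small embeddings of the same columns.
wide_columns = [device_ip, usr_cnt] + wide_crosses
deep_columns = [tf.feature_column.embedding_column(device_ip, dimension=16),
                tf.feature_column.embedding_column(usr_cnt, dimension=2)]

In the competition code above, the same idea is simply applied to every count feature and to a much larger set of pairwise crosses.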
