I recently tried TensorFlow's wide & deep model on a Kaggle CTR (click-through rate) prediction competition. The result was decent; the final score should land roughly in the top 100, although the competition ended about four years ago. This post mainly shares the approach and the code, which is adapted from the census model in the official wide & deep example.
The model uses the original features plus a set of newly generated ones. Rather than describing every new feature one by one, see the code below.
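Most of the generated features are frequency counts keyed on a derived user id: the device_id when it is present, otherwise device_ip plus device_model. A minimal sketch of that counting idea (illustrative only; the full script below prefixes the keys and adds day-level, ad-level, and visit-interval counts):

import collections, csv

usr_count = collections.defaultdict(int)       # impressions per user
usr_hour_count = collections.defaultdict(int)  # impressions per user per hour

for row in csv.DictReader(open('train.csv')):
    if row['device_id'] == 'a99f214a':         # placeholder for a missing device id
        usr = row['device_ip'] + row['device_model']
    else:
        usr = row['device_id']
    usr_count[usr] += 1
    usr_hour_count[usr + '-' + row['hour']] += 1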
#coding:utf-8
import collections, csv, sys, pickle, os, time
import numpy as np

_CSV_COLUMNS = [
    'id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
    'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

NEW_CSV_COLUMNS = _CSV_COLUMNS + [
    'device_id_cnt', 'device_ip_cnt', 'usr_cnt', 'usr_hour_cnt', 'new_usr',
    'usr_day_count', 'usr_day_count_unique', 'usr_hour_adid_count',
    'usr_hour_adgroup_count', 'usr_day_adid_count', 'usr_day_adgroup_count',
    'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',
]

NEW_CSV_COLUMNS_hour = NEW_CSV_COLUMNS + [
    'mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize',
    # 'device_id_2', 'device_ip_2'
]

_CSV_COLUMNS_test = [
    'id', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
    'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

NEW_CSV_COLUMNS_test = _CSV_COLUMNS_test + [
    'device_id_cnt', 'device_ip_cnt', 'usr_cnt', 'usr_hour_cnt', 'new_usr',
    'usr_day_count', 'usr_day_count_unique', 'usr_hour_adid_count',
    'usr_hour_adgroup_count', 'usr_day_adid_count', 'usr_day_adgroup_count',
    'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',
]

NEW_CSV_COLUMNS_test_hour = NEW_CSV_COLUMNS_test + [
    'mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize',
    # 'device_id_2', 'device_ip_2'
]
# Handle the empty device_id field: 'a99f214a' is the placeholder for a missing
# device id, so fall back to device_ip + device_model as the user key.
def def_usr(row):
    if row['device_id'] == 'a99f214a':
        usr = 'ip-' + row['device_ip'] + row['device_model']
    else:
        # print(row['device_id'])
        usr = 'id-' + row['device_id']
    return usr
def scan(path):
    for i, row in enumerate(csv.DictReader(open(path)), start=1):
        usr = def_usr(row)
        id_count[row['device_id']] += 1
        ip_count[row['device_ip']] += 1
        usr_count[usr] += 1
        usr_hour_count[usr + '-' + row['hour']] += 1  # number of impressions to the user in the hour

        # part 3, added 20190410
        usr_day_count[usr + '-' + row['hour'][0:6]] += 1  # number of impressions to the user in the day
        usr_day_count_unique[usr].add(row['hour'][0:6])   # number of days the user appeared
        usr_hour_adid_count[usr + '-' + row['C14'] + '-' + row['hour']] += 1        # number of impressions to the user for the adid in the hour
        usr_hour_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour']] += 1     # number of impressions to the user for the adgroup in the hour
        usr_day_adid_count[usr + '-' + row['C14'] + '-' + row['hour'][0:6]] += 1    # number of impressions to the user for the adid in the day
        usr_day_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour'][0:6]] += 1  # number of impressions to the user for the adgroup in the day
        usr_day_appid_count[usr + '-' + row['app_id'] + '-' + row['hour'][0:6]] += 1   # number of impressions to the user for the appid in the day
        usr_day_siteid_count[usr + '-' + row['site_id'] + '-' + row['hour'][0:6]] += 1  # number of impressions to the user for the siteid in the day

        hour0 = hour_record.get(usr, '01010101')
        hour0 = time.strptime(hour0, '%y%m%d%H')
        hour1 = time.strptime(row['hour'], '%y%m%d%H')
        usr_last_visit_diff[usr + '-' + row['hour']] = (
            24 * (hour1.tm_yday - hour0.tm_yday) + (hour1.tm_hour - hour0.tm_hour))  # interval of visit time
        hour_record.update({usr: row['hour']})

        if i % 100000 == 0:
            print(i, row)
            print(hour0, hour1, usr_last_visit_diff[usr + '-' + row['hour']])
def gen_data_3rd(src_path, dst_path, train=True):
    reader = csv.DictReader(open(src_path))
    if train:
        # writer = csv.DictWriter(open(dst_path, 'w', newline=''), NEW_CSV_COLUMNS_hour)
        writer = csv.DictWriter(open(dst_path, 'w'), NEW_CSV_COLUMNS_hour)
    else:
        # writer = csv.DictWriter(open(dst_path, 'w', newline=''), NEW_CSV_COLUMNS_test_hour)
        writer = csv.DictWriter(open(dst_path, 'w'), NEW_CSV_COLUMNS_test_hour)
    writer.writeheader()

    for i, row in enumerate(reader, start=1):
        if i % 100000 == 0:
            print(i, row)

        new_row = {}
        if train:
            for field in _CSV_COLUMNS:
                new_row[field] = row[field]
        else:
            for field in _CSV_COLUMNS_test:
                new_row[field] = row[field]

        # part 1 features
        usr = def_usr(row)
        new_row['device_id_cnt'] = id_count[row['device_id']]
        new_row['device_ip_cnt'] = ip_count[row['device_ip']]
        new_row['usr_cnt'] = usr_count[usr]
        new_row['usr_hour_cnt'] = usr_hour_count[usr + '-' + row['hour']]  # note: no str() conversion here, the CSV values are already strings
        new_row['new_usr'] = usr
        # part 2 features
        hour = time.strptime(new_row['hour'], '%y%m%d%H')
        new_row['mday'] = hour.tm_mday
        new_row['wday'] = hour.tm_wday
        new_row['tmhour'] = hour.tm_hour
        new_row['weekend'] = int(hour.tm_wday == 5 or hour.tm_wday == 6)
        # tm_hour is in the range 0-23, so midnight is 0
        new_row['night'] = int(hour.tm_hour in [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8])
        new_row['imgsize'] = new_row['C15'] + '*' + new_row['C16']
        # new_row['device_id_2'] = new_row['device_id'][0:4]
        # new_row['device_ip_2'] = new_row['device_ip'][0:4]
        # part 3 features
        new_row['usr_day_count'] = usr_day_count[usr + '-' + row['hour'][0:6]]
        new_row['usr_day_count_unique'] = len(usr_day_count_unique[usr])
        new_row['usr_hour_adid_count'] = usr_hour_adid_count[usr + '-' + row['C14'] + '-' + row['hour']]
        new_row['usr_hour_adgroup_count'] = usr_hour_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour']]
        new_row['usr_day_adid_count'] = usr_day_adid_count[usr + '-' + row['C14'] + '-' + row['hour'][0:6]]
        new_row['usr_day_adgroup_count'] = usr_day_adgroup_count[usr + '-' + row['C17'] + '-' + row['hour'][0:6]]
        new_row['usr_day_appid_count'] = usr_day_appid_count[usr + '-' + row['app_id'] + '-' + row['hour'][0:6]]
        new_row['usr_day_siteid_count'] = usr_day_siteid_count[usr + '-' + row['site_id'] + '-' + row['hour'][0:6]]
        new_row['usr_last_visit_diff'] = usr_last_visit_diff[usr + '-' + row['hour']]

        if i % 100000 == 0:
            print('new', i, new_row)
        writer.writerow(new_row)
# split train data
def newdata(path):
    trainf = open(os.path.join(base_path, 'ctr_train4'), 'w')
    valf = open(os.path.join(base_path, 'ctr_val4'), 'w')
    for t, line in enumerate(open(path)):
        temp = np.random.rand(1)
        if temp < 0.01 and t > 0:
            valf.write(line)
        elif t > 0:
            trainf.write(line)
        else:
            print(t, line)
        if t % 100000 == 0:
            print(t, line)
    print(t)
    trainf.close()
    valf.close()


# remove header
def newdatatest(path):
    testf = open(os.path.join(base_path, 'ctr_test4'), 'w')
    for t, line in enumerate(open(path)):
        if t > 0:
            testf.write(line)
            # valf.write(line)
        else:
            print(t, line)
        if t % 100000 == 0:
            print(t, line)
    print(t)
    testf.close()
if __name__ == '__main__':
    base_path = r'/home/lon/zl/ctr/temp'
    train = os.path.join(base_path, 'train.csv')              # path to the original training file
    new_train = os.path.join(base_path, 'new_train_3rd.csv')  # path to the generated training file
    test = os.path.join(base_path, 'test.csv')                # path to the original testing file
    new_test = os.path.join(base_path, 'new_test_3rd.csv')    # path to the generated testing file

    id_count = collections.defaultdict(int)
    ip_count = collections.defaultdict(int)
    usr_count = collections.defaultdict(int)
    usr_hour_count = collections.defaultdict(int)
    usr_day_count = collections.defaultdict(int)
    usr_hour_adid_count = collections.defaultdict(int)
    usr_hour_adgroup_count = collections.defaultdict(int)
    usr_day_adid_count = collections.defaultdict(int)
    usr_day_adgroup_count = collections.defaultdict(int)
    usr_day_count_unique = collections.defaultdict(set)
    usr_day_appid_count = collections.defaultdict(int)
    usr_day_siteid_count = collections.defaultdict(int)
    usr_last_visit_diff = collections.defaultdict(int)
    hour_record = {}

    scan(train)
    gen_data_3rd(train, new_train)

    # Reset all counters, then repeat for the test file.
    id_count = collections.defaultdict(int)
    ip_count = collections.defaultdict(int)
    usr_count = collections.defaultdict(int)
    usr_hour_count = collections.defaultdict(int)
    usr_day_count = collections.defaultdict(int)
    usr_hour_adid_count = collections.defaultdict(int)
    usr_hour_adgroup_count = collections.defaultdict(int)
    usr_day_adid_count = collections.defaultdict(int)
    usr_day_adgroup_count = collections.defaultdict(int)
    usr_day_count_unique = collections.defaultdict(set)
    usr_day_appid_count = collections.defaultdict(int)
    usr_day_siteid_count = collections.defaultdict(int)
    usr_last_visit_diff = collections.defaultdict(int)
    hour_record = {}

    scan(test)
    gen_data_3rd(test, new_test, train=False)
    # Split the generated training data into train and validation sets, and drop
    # the headers, so the files match the column lists expected by the model code below.
    newdata(new_train)
    newdatatest(new_test)
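A quick way to sanity-check the generated file before training is to compare its header against the expected column list. A minimal sketch, meant to run alongside the script above (it reuses NEW_CSV_COLUMNS_hour and the same base path):

import csv, os

base_path = r'/home/lon/zl/ctr/temp'
with open(os.path.join(base_path, 'new_train_3rd.csv')) as f:
    header = next(csv.reader(f))
assert header == NEW_CSV_COLUMNS_hour, 'unexpected column set or order'
print(len(header), 'columns, first five:', header[:5])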
The code that builds the feature columns for the TensorFlow model (adapted from the official census_dataset.py):
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download and clean the Census Income Dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, copy

# pylint: disable=wrong-import-order
from absl import app as absl_app
from absl import flags
from six.moves import urllib
import tensorflow as tf
# pylint: enable=wrong-import-order

from official.utils.flags import core as flags_core
def _download_and_clean_file(filename, url):
    """Downloads data from url, and makes changes to match the CSV format."""
    temp_file, _ = urllib.request.urlretrieve(url)
    with tf.gfile.Open(temp_file, 'r') as temp_eval_file:
        with tf.gfile.Open(filename, 'w') as eval_file:
            for line in temp_eval_file:
                line = line.strip()
                line = line.replace(', ', ',')
                if not line or ',' not in line:
                    continue
                if line[-1] == '.':
                    line = line[:-1]
                line += '\n'
                eval_file.write(line)
    tf.gfile.Remove(temp_file)


def download(data_dir):
    """Download census data if it is not already present."""
    tf.gfile.MakeDirs(data_dir)

    training_file_path = os.path.join(data_dir, TRAINING_FILE)
    if not tf.gfile.Exists(training_file_path):
        _download_and_clean_file(training_file_path, TRAINING_URL)

    eval_file_path = os.path.join(data_dir, EVAL_FILE)
    if not tf.gfile.Exists(eval_file_path):
        _download_and_clean_file(eval_file_path, EVAL_URL)
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult'
TRAINING_FILE = 'ctr_train4'
TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE)
EVAL_FILE = 'ctr_val4'
EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE)
TEST_FILE = 'ctr_test4'

_CSV_COLUMNS_3rd = [
    'id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
    'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
    'device_id_cnt', 'device_ip_cnt', 'usr_cnt', 'usr_hour_cnt',
    'new_usr', 'usr_day_count', 'usr_day_count_unique',
    'usr_hour_adid_count', 'usr_hour_adgroup_count',
    'usr_day_adid_count', 'usr_day_adgroup_count',
    'usr_day_appid_count', 'usr_day_siteid_count', 'usr_last_visit_diff',
    'mday', 'wday', 'weekend', 'tmhour', 'night', 'imgsize',
    'device_id_2', 'device_ip_2'
]
_CSV_COLUMNS_3rd_test = copy.copy(_CSV_COLUMNS_3rd)
_CSV_COLUMNS_3rd_test.remove('click')

_CSV_COLUMN_DEFAULTS_3rd = [
    [''], [0], [0], [0], [0],
    [''], [''], [''], [''], [''], [''], [''], [''], [''],
    [0], [0],
    [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0],
    [''],
    [0], [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0], [0],
    [''], [''], [''],
]
_CSV_COLUMN_DEFAULTS_3rd_test = [
    [''], [0], [0], [0],
    [''], [''], [''], [''], [''], [''], [''], [''], [''],
    [0], [0],
    [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0],
    [''],
    [0], [0], [0], [0], [0], [0], [0], [0], [0],
    [0], [0], [0], [0], [0],
    [''], [''], [''],
]

# 40428968 lines = 40428967 examples + 1 header
_NUM_EXAMPLES = {
    'train': 40000000,
    'validation': 428967,
    'test': 4577464,
}

_HASH_BUCKET_SIZE = 100  # 1048576
_HASH_BUCKET_SIZE_LARGE = 1000000  # 000  # 1048576
def input_fn_3rd(data_file, num_epochs, shuffle, batch_size, test=False):
    """Generate an input function for the Estimator."""
    assert tf.gfile.Exists(data_file), (
        '%s not found. Please make sure you have run census_dataset.py and '
        'set the --data_dir argument to the correct path.' % data_file)

    print('data_file is:', data_file, test)

    def parse_csv(value):
        tf.logging.info('Parsing {}'.format(data_file))
        # columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        if test:
            columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS_3rd_test)
            features = dict(zip(_CSV_COLUMNS_3rd_test, columns))
            return features
        else:
            columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS_3rd)
            features = dict(zip(_CSV_COLUMNS_3rd, columns))
            # labels = features.pop('income_bracket')
            # classes = tf.equal(labels, '>50K')  # binary classification
            classes = features.pop('click')
            return features, classes

    # Extract lines from input files using the Dataset API.
    # dataset = tf.contrib.data.TextLineDataset(data_file)
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=5)
    # dataset = dataset.map(parse_csv)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset
def build_model_columns_7():
    """Builds a set of wide and deep feature columns.

    A sample record, for reference:
      # id                    click  hour      C1    banner_pos  site_id
      # 10138062202267342206, 0,     14102100, 1005, 0,          6c5b482c,
      # site_domain  site_category  app_id    app_domain
      # 7687a86e,    3e814130,      ecad2386, 7801e8d9,
      # app_category  device_id  device_ip  device_model  device_type  device_conn_type
      # 07d7df22,     a99f214a,  9bc437f6,  ff2a3543,     1,           0,
      # C14-C21
      # 19015, 300, 250, 2162, 2, 39, 100217, 33

    Input columns:
      id, click, hour, C1, banner_pos, site_id, site_domain, site_category,
      app_id, app_domain, app_category, device_id, device_ip, device_model,
      device_type, device_conn_type, C14-C21,
      device_id_cnt, device_ip_cnt, usr_cnt, usr_hour_cnt, ...

    This version adds large-scale cross features and the "part 3" features,
    and bucketizes the numeric count features with a bucket width of 10.
    """
    # id = tf.feature_column.categorical_column_with_hash_bucket('id', hash_bucket_size=10000)
    # click = tf.feature_column.numeric_column('click')
    hour = tf.feature_column.categorical_column_with_hash_bucket(
        'hour', hash_bucket_size=300, dtype=tf.int64)
    C1 = tf.feature_column.categorical_column_with_hash_bucket(
        'C1', hash_bucket_size=20, dtype=tf.int64)
    banner_pos = tf.feature_column.categorical_column_with_hash_bucket(
        'banner_pos', hash_bucket_size=20, dtype=tf.int64)
    site_id = tf.feature_column.categorical_column_with_hash_bucket(
        'site_id', hash_bucket_size=7000)
    site_domain = tf.feature_column.categorical_column_with_hash_bucket(
        'site_domain', hash_bucket_size=12000)
    site_category = tf.feature_column.categorical_column_with_hash_bucket(
        'site_category', hash_bucket_size=50)
    app_id = tf.feature_column.categorical_column_with_hash_bucket(
        'app_id', hash_bucket_size=12000)
    app_domain = tf.feature_column.categorical_column_with_hash_bucket(
        'app_domain', hash_bucket_size=900)
    app_category = tf.feature_column.categorical_column_with_hash_bucket(
        'app_category', hash_bucket_size=100)
    device_id = tf.feature_column.categorical_column_with_hash_bucket(
        'device_id', hash_bucket_size=3000000)
    device_ip = tf.feature_column.categorical_column_with_hash_bucket(
        'device_ip', hash_bucket_size=8000000)
    device_model = tf.feature_column.categorical_column_with_hash_bucket(
        'device_model', hash_bucket_size=15000)
    device_type = tf.feature_column.categorical_column_with_hash_bucket(
        'device_type', hash_bucket_size=20, dtype=tf.int64)
    device_conn_type = tf.feature_column.categorical_column_with_hash_bucket(
        'device_conn_type', hash_bucket_size=20, dtype=tf.int64)
    C14 = tf.feature_column.categorical_column_with_hash_bucket(
        'C14', hash_bucket_size=5000, dtype=tf.int64)
    C15 = tf.feature_column.categorical_column_with_hash_bucket(
        'C15', hash_bucket_size=30, dtype=tf.int64)
    C16 = tf.feature_column.categorical_column_with_hash_bucket(
        'C16', hash_bucket_size=30, dtype=tf.int64)
    C17 = tf.feature_column.categorical_column_with_hash_bucket(
        'C17', hash_bucket_size=1000, dtype=tf.int64)
    C18 = tf.feature_column.categorical_column_with_hash_bucket(
        'C18', hash_bucket_size=30, dtype=tf.int64)
    C19 = tf.feature_column.categorical_column_with_hash_bucket(
        'C19', hash_bucket_size=200, dtype=tf.int64)
    C20 = tf.feature_column.categorical_column_with_hash_bucket(
        'C20', hash_bucket_size=500, dtype=tf.int64)
    C21 = tf.feature_column.categorical_column_with_hash_bucket(
        'C21', hash_bucket_size=200, dtype=tf.int64)

    device_id_cnt_0 = tf.feature_column.numeric_column('device_id_cnt', dtype=tf.int64)
    device_ip_cnt_0 = tf.feature_column.numeric_column('device_ip_cnt', dtype=tf.int64)
    usr_cnt_0 = tf.feature_column.numeric_column('usr_cnt', dtype=tf.int64)
    usr_hour_cnt_0 = tf.feature_column.numeric_column('usr_hour_cnt', dtype=tf.int64)

    device_id_cnt = tf.feature_column.bucketized_column(device_id_cnt_0, boundaries=range(0, 10000, 10))
    device_ip_cnt = tf.feature_column.bucketized_column(device_ip_cnt_0, boundaries=range(0, 10000, 10))
    usr_cnt = tf.feature_column.bucketized_column(usr_cnt_0, boundaries=range(0, 10000, 10))
    usr_hour_cnt = tf.feature_column.bucketized_column(usr_hour_cnt_0, boundaries=range(0, 10000, 10))

    mday = tf.feature_column.categorical_column_with_hash_bucket('mday', hash_bucket_size=31, dtype=tf.int64)
    wday = tf.feature_column.categorical_column_with_hash_bucket('wday', hash_bucket_size=7, dtype=tf.int64)
    weekend = tf.feature_column.categorical_column_with_hash_bucket('weekend', hash_bucket_size=2, dtype=tf.int64)
    tmhour = tf.feature_column.categorical_column_with_hash_bucket('tmhour', hash_bucket_size=24, dtype=tf.int64)
    night = tf.feature_column.categorical_column_with_hash_bucket('night', hash_bucket_size=2, dtype=tf.int64)
    imgsize = tf.feature_column.categorical_column_with_hash_bucket('imgsize', hash_bucket_size=100)

    new_usr = tf.feature_column.categorical_column_with_hash_bucket('new_usr', hash_bucket_size=8000000)

    usr_day_count_0 = tf.feature_column.numeric_column('usr_day_count', dtype=tf.int64)
    usr_day_count_unique_0 = tf.feature_column.numeric_column('usr_day_count_unique', dtype=tf.int64)
    usr_hour_adid_count_0 = tf.feature_column.numeric_column('usr_hour_adid_count', dtype=tf.int64)
    usr_hour_adgroup_count_0 = tf.feature_column.numeric_column('usr_hour_adgroup_count', dtype=tf.int64)
    usr_day_adid_count_0 = tf.feature_column.numeric_column('usr_day_adid_count', dtype=tf.int64)
    usr_day_adgroup_count_0 = tf.feature_column.numeric_column('usr_day_adgroup_count', dtype=tf.int64)
    usr_day_appid_count_0 = tf.feature_column.numeric_column('usr_day_appid_count', dtype=tf.int64)
    usr_day_siteid_count_0 = tf.feature_column.numeric_column('usr_day_siteid_count', dtype=tf.int64)
    usr_last_visit_diff_0 = tf.feature_column.numeric_column('usr_last_visit_diff', dtype=tf.int64)

    usr_day_count = tf.feature_column.bucketized_column(usr_day_count_0, boundaries=range(0, 10000, 10))
    usr_day_count_unique = tf.feature_column.bucketized_column(usr_day_count_unique_0, boundaries=range(0, 10000, 10))
    usr_hour_adid_count = tf.feature_column.bucketized_column(usr_hour_adid_count_0, boundaries=range(0, 10000, 10))
    usr_hour_adgroup_count = tf.feature_column.bucketized_column(usr_hour_adgroup_count_0, boundaries=range(0, 10000, 10))
    usr_day_adid_count = tf.feature_column.bucketized_column(usr_day_adid_count_0, boundaries=range(0, 10000, 10))
    usr_day_adgroup_count = tf.feature_column.bucketized_column(usr_day_adgroup_count_0, boundaries=range(0, 10000, 10))
    usr_day_appid_count = tf.feature_column.bucketized_column(usr_day_appid_count_0, boundaries=range(0, 10000, 10))
    usr_day_siteid_count = tf.feature_column.bucketized_column(usr_day_siteid_count_0, boundaries=range(0, 10000, 10))
    usr_last_visit_diff = tf.feature_column.bucketized_column(usr_last_visit_diff_0, boundaries=range(0, 10000, 10))
    # Wide columns and deep columns.
    base_columns = [
        hour, C1, banner_pos, site_id, site_domain, site_category,
        app_id, app_domain, app_category, device_id, device_ip, device_model,
        device_type, device_conn_type,
        C14, C15, C16, C17, C18, C19, C20, C21,
        # Raw numeric columns cannot go into the wide part: the loss suddenly
        # jumps up and down (FTRL does not handle raw numeric features well),
        # so the bucketized versions are used instead.
        device_id_cnt, device_ip_cnt, usr_cnt, usr_hour_cnt,
        mday, wday, weekend, tmhour, night, imgsize,
        new_usr,
        usr_day_count, usr_day_count_unique, usr_hour_adid_count,
        usr_hour_adgroup_count, usr_day_adid_count, usr_day_adgroup_count,
        usr_day_appid_count, usr_day_siteid_count, usr_last_visit_diff,
    ]

    CSV_COLUMNS = copy.copy(_CSV_COLUMNS_3rd_test)
    CSV_COLUMNS.remove('id')
    # CSV_COLUMNS.remove('device_id_2')
    # CSV_COLUMNS.remove('device_ip_2')
    L = len(CSV_COLUMNS)
    x = sorted(CSV_COLUMNS)

    # Generating all pairwise cross features greatly increases running time ...
    two_rd_col = []
    n = 0
    for i in xrange(L):
        for j in xrange(i + 1, L):
            # if x[i] in ['app_id', 'site_id'] or x[j] in ['app_id', 'site_id']:
            n += 1
            print(n, [x[i], x[j]])  # 595 feature 561
            two_rd_col.append(
                tf.feature_column.crossed_column([x[i], x[j]], hash_bucket_size=10000))

    # add 3rd-order cross features
    # three_rd_col = []
    # n = 0
    # for i in xrange(L):
    #     for j in xrange(i + 1, L):
    #         for k in xrange(j + 1, L):
    #             n += 1
    #             print(n, i, j, k, [x[i], x[j], x[k]])  # 6545 feature 5984
    #             three_rd_col.append(
    #                 tf.feature_column.crossed_column([x[i], x[j], x[k]], hash_bucket_size=100))

    # tf.feature_column.crossed_column(
    #     ['device_type', 'banner_pos'], hash_bucket_size=1000),
    # tf.feature_column.crossed_column(
    #     ['device_conn_type', 'banner_pos', 'site_category'],
    #     hash_bucket_size=_HASH_BUCKET_SIZE),
    # tf.feature_column.crossed_column(
    #     ['device_id', 'site_id'], hash_bucket_size=100000),
    # ]

    wide_columns = base_columns + two_rd_col  # + three_rd_col
    deep_columns = [
        tf.feature_column.embedding_column(hour, dimension=2),
        # tf.feature_column.indicator_column(C1),
        tf.feature_column.embedding_column(C1, dimension=1),
        tf.feature_column.embedding_column(banner_pos, dimension=1),
        tf.feature_column.embedding_column(site_id, dimension=6),
        tf.feature_column.embedding_column(site_domain, dimension=6),
        tf.feature_column.embedding_column(site_category, dimension=2),
        tf.feature_column.embedding_column(app_id, dimension=8),
        tf.feature_column.embedding_column(app_domain, dimension=6),
        tf.feature_column.embedding_column(app_category, dimension=2),
        tf.feature_column.embedding_column(device_id, dimension=40),
        tf.feature_column.embedding_column(device_ip, dimension=50),
        tf.feature_column.embedding_column(device_model, dimension=8),
        # tf.feature_column.indicator_column(device_type),
        tf.feature_column.embedding_column(device_type, dimension=1),
        tf.feature_column.embedding_column(device_conn_type, dimension=1),
        tf.feature_column.embedding_column(C14, dimension=4),
        tf.feature_column.embedding_column(C15, dimension=1),
        tf.feature_column.embedding_column(C16, dimension=1),
        tf.feature_column.embedding_column(C17, dimension=4),
        tf.feature_column.embedding_column(C18, dimension=1),
        tf.feature_column.embedding_column(C19, dimension=2),
        tf.feature_column.embedding_column(C20, dimension=2),
        tf.feature_column.embedding_column(C21, dimension=2),
        tf.feature_column.embedding_column(device_id_cnt, dimension=2),
        tf.feature_column.embedding_column(device_ip_cnt, dimension=2),
        tf.feature_column.embedding_column(usr_cnt, dimension=2),
        tf.feature_column.embedding_column(usr_hour_cnt, dimension=2),
        tf.feature_column.embedding_column(mday, dimension=1),
        tf.feature_column.embedding_column(wday, dimension=1),
        tf.feature_column.embedding_column(weekend, dimension=1),
        tf.feature_column.embedding_column(tmhour, dimension=1),
        tf.feature_column.embedding_column(night, dimension=1),
        tf.feature_column.embedding_column(imgsize, dimension=1),
        tf.feature_column.embedding_column(new_usr, dimension=40),
        tf.feature_column.embedding_column(usr_day_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_count_unique, dimension=2),
        tf.feature_column.embedding_column(usr_hour_adid_count, dimension=2),
        tf.feature_column.embedding_column(usr_hour_adgroup_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_adid_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_adgroup_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_appid_count, dimension=2),
        tf.feature_column.embedding_column(usr_day_siteid_count, dimension=2),
        tf.feature_column.embedding_column(usr_last_visit_diff, dimension=2),
    ]

    # Embeddings of the 2nd-order cross features for the deep part.
    deep_columns_2rd = [tf.feature_column.embedding_column(ff, dimension=4) for ff in two_rd_col]
    # deep_columns_3rd = [tf.feature_column.embedding_column(ff, dimension=1) for ff in three_rd_col]

    return wide_columns, deep_columns + deep_columns_2rd  # + deep_columns_3rd
def define_data_download_flags():
    """Add flags specifying data download arguments."""
    flags.DEFINE_string(
        name="data_dir", default=r'/home/lon/zl/ctr/temp',
        help=flags_core.help_wrap("Directory to download and extract data."))


def main(_):
    download(flags.FLAGS.data_dir)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    define_data_download_flags()
    absl_app.run(main)
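Before wiring this into the Estimator, the input pipeline can be smoke-tested by pulling one batch through input_fn_3rd. A minimal TF 1.x sketch, assuming the generated ctr_train4 file is already in place:

import tensorflow as tf

def peek_one_batch():
    # Build the dataset exactly as the Estimator will consume it.
    dataset = input_fn_3rd(r'/home/lon/zl/ctr/temp/ctr_train4',
                           num_epochs=1, shuffle=False, batch_size=4)
    features, labels = dataset.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        f, l = sess.run([features, labels])
    print(sorted(f.keys()))
    print('labels:', l)

Passing shuffle=False skips the 40-million-element shuffle buffer, and if the column lists and record defaults are out of sync with the file, tf.decode_csv fails here rather than minutes into training.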
Finally, the script that runs training and prediction:
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train DNN on census income dataset."""
import os, sys

from absl import app as absl_app
from absl import flags
import tensorflow as tf

# sys.path.append(r'C:\Users\lon\Documents\slowlon\TFdemo\my_program\models-master')
# sys.path.append(r'/workplace/zb/zl/ctr/models-master')
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import census_dataset
from official.wide_deep import wide_deep_run_loop
def define_census_flags():
    wide_deep_run_loop.define_wide_deep_flags()
    flags.adopt_module_key_flags(wide_deep_run_loop)
    flags_core.set_defaults(
        data_dir=r'/home/lon/zl/ctr/temp',
        model_dir=r'/home/lon/zl/ctr/temp/model6-32rd',
        # model2 is the model with 3rd-order crosses: very slow, ~300 s / 100 steps;
        # model2rd is the model with 2nd-order crosses: ~17 s / 100 steps;
        # export_dir=r'/home/lon/zl/ctr/temp/model1024-add12f',
        train_epochs=4,
        epochs_between_evals=1,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        batch_size=56)
def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
    """Build an estimator appropriate for the given model type."""
    wide_columns, deep_columns = model_column_fn()
    # hidden_units = [100, 75, 50, 25]
    # hidden_units = [256, 128, 64]
    # hidden_units = [75, 50, 25]
    # hidden_units = [24, 24, 12]
    hidden_units = [1024, 512, 256]
    # hidden_units = [1024, 512, 256, 128]

    # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
    # trains faster than GPU for this model.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            # device_count={'GPU': 0},
            inter_op_parallelism_threads=inter_op,
            intra_op_parallelism_threads=intra_op))
    # session_config = tf.ConfigProto(
    #     inter_op_parallelism_threads=inter_op,
    #     intra_op_parallelism_threads=intra_op)

    if model_type == 'wide':
        return tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns,
            config=run_config)
    elif model_type == 'deep':
        return tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=hidden_units,
            config=run_config)
    else:
        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=hidden_units,
            config=run_config)
def run_census(flags_obj):
    """Construct all necessary functions and call run_loop.

    Args:
      flags_obj: Object containing user specified flags.
    """
    if flags_obj.download_if_missing:
        census_dataset.download(flags_obj.data_dir)

    train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE)
    test_file = os.path.join(flags_obj.data_dir, census_dataset.EVAL_FILE)

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    def train_input_fn():
        return census_dataset.input_fn_3rd(
            train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

    def eval_input_fn():
        return census_dataset.input_fn_3rd(test_file, 1, False, flags_obj.batch_size)

    tensors_to_log = {
        'average_loss': '{loss_prefix}head/truediv',
        'loss': '{loss_prefix}head/weighted_loss/Sum'
    }

    wide_deep_run_loop.run_loop(
        name="ctr train",
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        model_column_fn=census_dataset.build_model_columns_7,
        build_estimator_fn=build_estimator,
        flags_obj=flags_obj,
        tensors_to_log=tensors_to_log,
        early_stop=True)
def combine(path):
    """Join the test ids with the predicted click probabilities into a submission file."""
    # trainf = open(os.path.join(base_path, 'ctr_train'), 'w')
    # valf = open(os.path.join(base_path, 'ctr_val'), 'w')
    testf = open(os.path.join(r'/home/lon/zl/ctr/official/wide_deep', 'submission_1.csv'), 'r')
    outfile = open(os.path.join(r'/home/lon/zl/ctr/temp_result', 'submission_wd_model6-32rd.csv'), 'w')
    outfile.write('id,click\n')
    print('here!')
    for t, line in enumerate(open(path)):
        line1 = testf.readline()
        if t > 0:
            outfile.write('%s,%s\n' % (str(line.rstrip().split(',')[0]),
                                       str(line1.rstrip().split(',')[1])))
        if t % 100000 == 0:
            print('T', t)
    print(t)
    print(line.rstrip().split(',')[0], line1.rstrip().split(',')[1])

    # trainf.close()
    # valf.close()
    testf.close()
    outfile.close()


test = os.path.join(r'/home/lon/zl/ctr/temp', 'test.csv')  # path to the original testing file
def run_test(flags_obj):
    """Construct all necessary functions and call predict_loop.

    Args:
      flags_obj: Object containing user specified flags.
    """
    # if flags_obj.download_if_missing:
    #     census_dataset.download(flags_obj.data_dir)

    # train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE)
    test_file = os.path.join(flags_obj.data_dir, census_dataset.TEST_FILE)

    # def train_input_fn():
    #     return census_dataset.input_fn(
    #         train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

    def test_input_fn():
        return census_dataset.input_fn_3rd(test_file, 1, False, 600, test=True)

    # Same logging tensors as in run_census, defined here as well so this
    # function is self-contained.
    tensors_to_log = {
        'average_loss': '{loss_prefix}head/truediv',
        'loss': '{loss_prefix}head/weighted_loss/Sum'
    }

    # predict_loop is a custom addition to wide_deep_run_loop (the stock module
    # only provides run_loop); it presumably runs estimator.predict and writes
    # submission_1.csv, which combine() then reads.
    wide_deep_run_loop.predict_loop(
        name="ctr test",
        test_input_fn=test_input_fn,
        model_column_fn=census_dataset.build_model_columns_7,
        build_estimator_fn=build_estimator,
        flags_obj=flags_obj,
        tensors_to_log=tensors_to_log)

    combine(test)
def main(_):
    with logger.benchmark_context(flags.FLAGS):
        run_census(flags.FLAGS)


def main_test(_):
    with logger.benchmark_context(flags.FLAGS):
        run_test(flags.FLAGS)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    define_census_flags()
    absl_app.run(main)       # training and evaluation
    absl_app.run(main_test)  # prediction; note that absl_app.run() exits the process,
                             # so in practice training and prediction are run as two
                             # separate invocations of this script
A few lessons learned:
Recommended reading: