Basic Usage
This chapter introduces the basic usage of RecIS so that newcomers can get started quickly.
Environment Setup
Before starting, make sure that recis and its data extension column-io are installed correctly; the following imports should succeed:
import torch
import recis
import column_io
Basic Concepts
- FeatureEngine: feature processing engine that supports complex feature transformations
- HashTable: core storage component for sparse parameters
- DynamicEmbedding: dynamically growing embedding table with automatic management of sparse parameters
- EmbeddingOption: configuration options for an embedding table
- EmbeddingEngine: management engine that coordinates multiple DynamicEmbeddings and provides optimization strategies such as sparse merging
- SparseOptimizer: optimizer designed specifically for sparse parameters
- Trainer: high-level manager for the training loop
Data Processing
Data Conversion Tools
RecIS reads data in ORC format. If your raw data is in CSV, convert it to ORC first; one way is sketched below.
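recis itself does not ship the conversion shown here; this is a minimal sketch using pyarrow (it requires a pyarrow build with ORC support, and the file paths are illustrative):
import pyarrow.csv as pv
import pyarrow.orc as po

# Read the CSV into an Arrow table, then write it out as ORC
table = pv.read_csv("data.csv")
po.write_table(table, "./data_dir/data.orc")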
Data Reading Example
import os
from recis.io.orc_dataset import OrcDataset
worker_idx = int(os.environ.get("RANK", 0))
worker_num = int(os.environ.get("WORLD_SIZE", 1))
dataset = OrcDataset(
    1024,  # batch size
    worker_idx=worker_idx,
    worker_num=worker_num,
    read_threads_num=2,  # number of data-reading threads
    prefetch=1,  # number of batches to prefetch
    is_compressed=False,
    drop_remainder=True,  # drop the last incomplete batch
    transform_fn=[lambda x: x[0]],
    dtype=torch.float32,
    device="cuda",  # place output batches directly on the CUDA device
    save_interval=None,
)
data_paths = ["./data_dir/"]
for path in data_paths:
    dataset.add_path(path)
# Read fixed-length features
dataset.fixedlen_feature("label", [0.0])
# Read variable-length features
dataset.varlen_feature("user_id")
dataset.varlen_feature("item_id")
# Build the data iterator
data_iter = iter(dataset)
data = next(data_iter)
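Each call to next() yields one batch. To consume the whole dataset, iterate over it directly; the sketch below assumes batches are delivered as dicts keyed by feature name, with "label" matching the fixed-length feature declared above:
# Loop over all batches produced by the dataset
for batch in dataset:
    print(batch["label"].shape)  # inspect the first batch, then stop
    break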
Feature Engineering
Basic Feature Processing
from recis.features import FeatureEngine
from recis.features.feature import Feature
from recis.features.op import SelectField, Hash, Bucketize
# Define feature processing pipeline
features = [
    # Hash user IDs into 100,000 buckets
    Feature(
        name="user_id",
        ops=[
            SelectField("user_id"),
            Hash(bucket_size=100000),
        ],
    ),
    # Hash item IDs into 50,000 buckets
    Feature(
        name="item_id",
        ops=[
            SelectField("item_id"),
            Hash(bucket_size=50000),
        ],
    ),
    # Bucketize ages by the given boundaries
    Feature(
        name="age_bucket",
        ops=[
            SelectField("age"),
            Bucketize(boundaries=[18, 25, 35, 45, 55, 65]),
        ],
    ),
]
# Create feature engine
feature_engine = FeatureEngine(features)
# Process data
input_data = {
    "user_id": torch.LongTensor([[1], [2], [3]]),
    "item_id": torch.LongTensor([[101], [102], [103]]),
    "age": torch.FloatTensor([[25], [35], [45]]),
}
processed_data = feature_engine(input_data)
print("Original data:", input_data)
print("Processed data:", processed_data)
Sparse Embedding Tables
Building Embedding Tables
Create Your First Embedding
from recis.nn import DynamicEmbedding, EmbeddingOption
# Configure embedding options
emb_opt = EmbeddingOption(
    embedding_dim=64,
    shared_name="my_embedding",
    combiner="sum",
)
# Create dynamic embedding
embedding = DynamicEmbedding(emb_opt)
# Use embedding
ids = torch.LongTensor([[1], [2], [3], [100], [1000]])
emb_output = embedding(ids)
print(f"Input IDs: {ids}")
print(f"Embedding output shape: {emb_output.shape}")
print(f"Embedding output: {emb_output}")
Use EmbeddingEngine to Manage and Optimize Embedding Tables
from recis.nn import EmbeddingEngine, EmbeddingOption
# Configure embedding options
user_emb_opt = EmbeddingOption(
    embedding_dim=64,
    shared_name="user_emb",
    combiner="sum",
)
item_emb_opt = EmbeddingOption(
    embedding_dim=64,
    shared_name="item_emb",
    combiner="sum",
)
# Create the embedding engine that manages both tables
embedding = EmbeddingEngine(
    {"user_emb": user_emb_opt, "item_emb": item_emb_opt}
)
# Use embedding
user_ids = torch.LongTensor([[1], [2], [3], [100], [1000]])
item_ids = torch.LongTensor([[11], [22], [33], [111], [1111]])
emb_output = embedding({"user_emb": user_ids, "item_emb": item_ids})
print(f"Embedding output: {emb_output}")
Build Sparse Parameter Optimizer
from recis.optim import SparseAdamW
from recis.nn.modules.hashtable import filter_out_sparse_param
# Create a simple model
class SimpleModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        emb_opt = EmbeddingOption(embedding_dim=32)
        self.embedding = DynamicEmbedding(emb_opt)
        self.linear = torch.nn.Linear(32, 1)

    def forward(self, ids):
        emb = self.embedding(ids)
        return self.linear(emb)
model = SimpleModel()
# Separate sparse and dense parameters
sparse_params = filter_out_sparse_param(model)
print("Sparse parameters:", list(sparse_params.keys()))
# Create optimizers
sparse_optimizer = SparseAdamW(sparse_params, lr=0.001)
dense_optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
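With the parameters split, a single training step drives both optimizers. The sketch below assumes SparseAdamW follows the standard torch.optim step()/zero_grad() interface; the IDs and labels are toy values for illustration:
ids = torch.LongTensor([[1], [2], [3]])
labels = torch.FloatTensor([[1.0], [0.0], [1.0]])  # toy targets

logits = model(ids)
loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)

# Clear, backpropagate, and step both parameter groups
sparse_optimizer.zero_grad()
dense_optimizer.zero_grad()
loss.backward()
sparse_optimizer.step()
dense_optimizer.step()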
Training with Trainer
Simple Training
# Build model
# model = ...
# Build data
# dataset = ...
# Define optimizers
# sparse_params = filter_out_sparse_param(model)
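Putting the pieces together, the Trainer wraps the loop shown in the previous section. The sketch below is an assumption about the API surface: the import path, the TrainingArguments fields, and the dense_optimizers/add_sparse_optimizer calls are inferred from common RecIS usage patterns and may differ, so consult the recis documentation for the exact signatures.
# Assumed import path -- verify against your recis version
from recis.framework.trainer import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir="./ckpt",  # hypothetical checkpoint directory
    train_steps=1000,
    log_steps=10,
)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset,
    dense_optimizers=(dense_optimizer, None),  # (optimizer, lr scheduler)
)
trainer.add_sparse_optimizer(sparse_optimizer)
trainer.train()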