forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathonline_write_benchmark.py
More file actions
100 lines (82 loc) · 2.95 KB
/
online_write_benchmark.py
File metadata and controls
100 lines (82 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import random
import string
import tempfile
from datetime import timedelta
import click
import pyarrow as pa
from tqdm import tqdm
from feast import FileSource
from feast.driver_test_data import create_driver_hourly_stats_df
from feast.entity import Entity
from feast.feature_store import FeatureStore
from feast.feature_view import FeatureView
from feast.field import Field
from feast.repo_config import RepoConfig
from feast.types import Float32, Int32
from feast.utils import _convert_arrow_to_proto, _utc_now
def create_driver_hourly_stats_feature_view(source):
    """Build the ``driver_stats`` feature view on top of ``source``.

    The view is keyed by a ``driver`` entity whose join key is ``driver_id``,
    carries a two-hour TTL, and declares three fields: ``conv_rate`` and
    ``acc_rate`` (Float32) plus ``avg_daily_trips`` (Int32).

    Args:
        source: Data source passed to the ``FeatureView`` unchanged.

    Returns:
        The newly constructed ``FeatureView``.
    """
    driver_entity = Entity(name="driver", join_keys=["driver_id"])
    stat_fields = [
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int32),
    ]
    return FeatureView(
        name="driver_stats",
        entities=[driver_entity],
        schema=stat_fields,
        source=source,
        ttl=timedelta(hours=2),
    )
def create_driver_hourly_stats_source(parquet_path):
    """Return a ``FileSource`` pointing at ``parquet_path``.

    The source uses ``event_timestamp`` as its timestamp field and
    ``created`` as its created-timestamp column.

    Args:
        parquet_path: Path handed to the ``FileSource`` as ``path``.

    Returns:
        The configured ``FileSource``.
    """
    stats_source = FileSource(
        path=parquet_path,
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )
    return stats_source
@click.command(name="run")
def benchmark_writes():
    """Write generated driver stats to the online store and tear it down.

    Creates a feature store with the "gcp" provider under a randomly named
    project, applies the driver stats feature view, generates 14 days of
    hourly stats for 100 driver ids, writes them with ``online_write_batch``
    behind a progress bar, and finally tears the infrastructure down. The
    teardown runs even when generating or writing the data fails.
    """
    suffix_alphabet = string.ascii_lowercase + string.digits
    project_id = "test" + "".join(
        random.choice(suffix_alphabet) for _ in range(10)
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project=project_id,
                provider="gcp",
            )
        )

        # The source only exists so the feature view has something to point
        # at; nothing is read from this parquet path here.
        parquet_path = os.path.join(temp_dir, "data.parquet")

        # NOTE(review): this entity is named "driver_id", whereas
        # create_driver_hourly_stats_feature_view builds its own entity named
        # "driver" (join key "driver_id") - confirm the mismatch is intended.
        driver = Entity(name="driver_id")
        table = create_driver_hourly_stats_feature_view(
            create_driver_hourly_stats_source(parquet_path=parquet_path)
        )
        store.apply([table, driver])

        provider = store._get_provider()

        # Everything after apply() runs under try/finally so that a failure
        # while generating or writing data cannot leave the applied
        # infrastructure behind.
        try:
            end_date = _utc_now()
            start_date = end_date - timedelta(days=14)

            driver_ids = list(range(100))
            data = create_driver_hourly_stats_df(driver_ids, start_date, end_date)

            # Show the data for reference
            print(data)

            proto_data = _convert_arrow_to_proto(
                pa.Table.from_pandas(data), table, ["driver_id"]
            )

            # Write it
            with tqdm(total=len(proto_data)) as progress:
                provider.online_write_batch(
                    project=store.project,
                    table=table,
                    data=proto_data,
                    progress=progress.update,
                )
        finally:
            registry_tables = store.list_feature_views()
            registry_entities = store.list_entities()
            provider.teardown_infra(
                store.project, tables=registry_tables, entities=registry_entities
            )
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    benchmark_writes()