3 changes: 2 additions & 1 deletion splitgraph/cloud/project/dbt.py
@@ -78,6 +78,7 @@ def generate_dbt_plugin_params(repositories: List[str]) -> Tuple[Dict[str, Any],
# the Git pull URL at action runtime (using GITHUB_TOKEN).
credentials = {"git_url": "$THIS_REPO_URL"}

params = {"sources": [_make_source(r) for r in repositories]}
# Same with the branch: we want to inject the current SHA we're running the action for.
params = {"sources": [_make_source(r) for r in repositories], "git_branch": "$THIS_SHA"}

return params, credentials
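
For orientation, a rough sketch of what this function now returns for a single source repository. The exact shape of the `_make_source(...)` entries is not part of this diff, so the source entry below is illustrative only:

```python
# Illustrative values only: the real source entries come from _make_source(),
# whose output is not shown in this diff.
params = {
    "sources": [
        {"namespace": "myns", "repository": "raw-data"},  # assumed/simplified source entry
    ],
    # Placeholder; the generated GitHub Actions step rewrites this to the commit
    # SHA being built (see the sed step added in github_actions.py below).
    "git_branch": "$THIS_SHA",
}

credentials = {
    # Placeholder; rewritten at action runtime to an authenticated clone URL
    # built from GITHUB_ACTOR and GITHUB_TOKEN.
    "git_url": "$THIS_REPO_URL",
}
```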
5 changes: 3 additions & 2 deletions splitgraph/cloud/project/github_actions.py
@@ -30,7 +30,8 @@ def generate_job(
{
"name": "Set up dbt Git URL",
"run": 'echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml',
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml',
"shell": "bash",
"env": {
"CREDENTIALS_YML": "${{secrets.SPLITGRAPH_CREDENTIALS_YML}}",
@@ -54,7 +55,7 @@ def generate_job(
steps.append(
{
"name": "Run sgr cloud load to set up metadata and data source settings",
"run": "sgr cloud load --remote splitgraph "
"run": "sgr cloud load --remote splitgraph --initial-private "
f"-f splitgraph.yml -f splitgraph.credentials.yml {repository}",
"shell": "bash",
}
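
Roughly, the two `sed` invocations in the "Set up dbt Git URL" step substitute the placeholders emitted by `generate_dbt_plugin_params`. A minimal Python sketch of the intended substitution, assuming the standard GitHub Actions environment variables (an illustration, not code from the repository):

```python
# Sketch of what the generated workflow step's sed commands do, expressed in
# Python for clarity. Assumes the usual GitHub Actions environment variables.
import os


def render_placeholders(credentials_yml: str, splitgraph_yml: str):
    actor = os.environ["GITHUB_ACTOR"]
    token = os.environ["GITHUB_TOKEN"]
    repo = os.environ["GITHUB_REPOSITORY"]  # e.g. "myorg/my-data-project"
    sha = os.environ["GITHUB_SHA"]  # commit the workflow is running for

    # $THIS_REPO_URL -> authenticated clone URL, so dbt can pull this repository.
    credentials_yml = credentials_yml.replace(
        "$THIS_REPO_URL", f"https://{actor}:{token}@github.com/{repo}"
    )
    # $THIS_SHA -> the current commit, pinning the dbt build to the pushed revision.
    splitgraph_yml = splitgraph_yml.replace("$THIS_SHA", sha)
    return credentials_yml, splitgraph_yml
```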
114 changes: 113 additions & 1 deletion splitgraph/cloud/project/templates.py
@@ -67,7 +67,7 @@
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
"""

SOURCES_YML_TEMPLATE = """# This file defines all data sources referenced by this model. The mapping
@@ -132,6 +132,118 @@
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).

### Edit `splitgraph.yml`

We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:

- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.

Example:

```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```

becomes:

```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```

### Set up GitHub Actions

Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
8 changes: 4 additions & 4 deletions splitgraph/commandline/cloud.py
@@ -632,15 +632,15 @@ def load_c(

repo_yaml = load_project(repositories_file)
repositories = repo_yaml.repositories
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

gql_client = GQLAPIClient(remote)

if not skip_external:
rest_client = RESTAPIClient(remote)
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

filter_credential_names = [
r.external.credential for r in repositories if r.external and r.external.credential
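
(The `limit_repositories` filter now appears to run right after the project file is loaded, rather than only inside the `not skip_external` branch, so it also applies when external data source setup is skipped.)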
4 changes: 3 additions & 1 deletion splitgraph/commandline/ingestion.py
@@ -142,7 +142,9 @@ def csv_import(
sample = [[str(i) for i in range(len(sample))]] + sample

type_overrides = dict(override_type or [])
sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
sg_schema = infer_sg_schema(
sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
)
logging.debug("Using Splitgraph schema: %r", sg_schema)

# Reset the stream and pass it to COPY FROM STDIN
9 changes: 6 additions & 3 deletions splitgraph/ingestion/inference.py
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
]


def _infer_column_schema(column_sample: Sequence[str]) -> str:
def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
for candidate, converter in _CONVERTERS:
try:
seen_value = False
for c in column_sample:
if c == "" or c is None:
if (c == "" and ignore_empty_strings) or c is None:
continue

seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
sample: Sequence[List[str]],
override_types: Optional[Dict[str, str]] = None,
primary_keys: Optional[List[str]] = None,
ignore_empty_strings: bool = True,
):
override_types = override_types or {}
primary_keys = primary_keys or []
@@ -92,7 +93,9 @@
)

for i, (c_name, c_sample) in enumerate(zip(header, columns)):
pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
pg_type = override_types.get(
c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
)

result.append(
TableColumn(
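
A small illustration of what the new `ignore_empty_strings` flag changes for CSV ingestion. The exact inferred type names depend on the converter chain in `_CONVERTERS`, which is not fully shown in this diff, so the comments below are indicative rather than exact:

```python
from splitgraph.ingestion.inference import infer_sg_schema

# First row is the header, the rest are data rows (the same layout csv_import builds).
sample = [
    ["id", "score"],
    ["1", ""],
    ["2", "7"],
]

# Default behaviour: empty strings are skipped during inference, so "score" can
# still be detected as a numeric column from its non-empty values.
numeric_schema = infer_sg_schema(sample)

# csv_import now passes ignore_empty_strings=False: the empty cell fails numeric
# conversion, so "score" should fall back to a text type instead, matching what
# the CSV file actually contains.
text_schema = infer_sg_schema(sample, ignore_empty_strings=False)
```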
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
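
(dbt has no built-in `cte` materialization; `ephemeral` is the supported equivalent, since ephemeral models are inlined into downstream queries as CTEs, which appears to be the motivation for this rename.)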
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -58,7 +58,7 @@ jobs:
splitgraph_api_secret: ${{ secrets.SPLITGRAPH_API_SECRET }}
- name: Set up dbt Git URL
run: echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && sed -i "s|\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g"
splitgraph.credentials.yml
splitgraph.credentials.yml && sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
shell: bash
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
@@ -39,6 +39,118 @@ repository and create the following secrets:
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).

### Edit `splitgraph.yml`

We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:

- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.

Example:

```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```

becomes:

```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```

### Set up GitHub Actions

Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
@@ -96,6 +96,7 @@ repositories:
namespace: myns
repository: airbyte-postgres
hash_or_tag: latest
git_branch: $THIS_SHA
is_live: false
tables: {}
metadata: