Docker: add entrypoint script for upload single dataset #53
No reviewers
Labels
No labels
Blocked
Bounty
$100
Bounty
$1000
Bounty
$10000
Bounty
$20
Bounty
$2000
Bounty
$250
Bounty
$50
Bounty
$500
Bounty
$5000
Bounty
$750
MoSCoW
Could have
MoSCoW
Must have
MoSCoW
Should have
Needs feedback
Points
1
Points
13
Points
2
Points
21
Points
3
Points
34
Points
5
Points
55
Points
8
Points
88
Priority
Backlog
Priority
Critical
Priority
High
Priority
Low
Priority
Medium
Signed-off: Owner
Signed-off: Scrum Master
Signed-off: Tech Lead
Spike
State
Completed
State
Duplicate
State
In Progress
State
In Review
State
Paused
State
Unverified
State
Verified
State
Wont Do
Type
Bug
Type
Discussion
Type
Documentation
Type
Epic
Type
Feature
Type
Legendary
Type
Support
Type
Task
Type
Testing
No milestone
No project
No assignees
2 participants
Notifications
Due date
No due date set.
Dependencies
No dependencies set.
Reference
cleverdatasets/dataset-uploader!53
Loading…
Add table
Add a link
Reference in a new issue
No description provided.
Delete branch "streaming-upload-docker"
Deleting a branch is permanent. Although the deleted branch may continue to exist for a short time before it actually gets removed, it CANNOT be undone in most cases. Continue?
The code contains an entry-point script for uploading a single dataset.
d3d581dfe9 to 415ce6baaf · 2e68a12951 to 7069bbdd93
When I run
behave features, I get the following summary:
I can't approve until the tests pass.
@ -122,0 +127,4 @@continue # Retry# All retries failed - try to salvage by parsing line-by-linelogger.warning(f"Chunk {idx} failed after {max_retries} attempts, attempting line-by-line recovery: {last_error}")ruff checkreports:fixed !!
@ -253,0 +292,4 @@]if triples:yield triplesparsed_successfully = Trueruff checkreports:fixed !!
@ -253,0 +298,4 @@if attempt < max_retries - 1:continue # Retryelse:logger.warning(f"Chunk parsing failed after {max_retries} attempts: {e}")ruff checkreports:fixed !!
@ -278,0 +323,4 @@try:graph = Graph()graph.parse(data=chunk_text, format="turtle")triples = [_convert_rdf_triple_to_dict(s, p, o) for s, p, o in graph]ruff checkreports:fixed !!
@ -278,0 +331,4 @@if attempt < max_retries - 1:continue # Retryelse:logger.warning(f"Final chunk parsing failed after {max_retries} attempts: {e}")ruff checkreports:fixed !!
@ -984,0 +1054,4 @@or "gateway timeout" in error_str# Network/connection errorsor "connection" in error_str and "refused" in error_strruff checkreports:fixed !!
@ -984,0 +1064,4 @@or "timeout" in error_type.lower()# SSL/TLS temporary issuesor "ssl" in error_str and "error" in error_strruff checkreports:fixed !!
@ -984,0 +1067,4 @@or "ssl" in error_str and "error" in error_str# Proxy errorsor "proxy" in error_str and "error" in error_strruff checkreports:fixed !!
@ -1004,3 +1096,3 @@raiseelse:# Not a rate limit error, re-raise immediately# Not a retryable error (authentication, permission, etc.) - fail immediatelyruff checkreports:fixed !!
@ -1826,2 +1898,2 @@"for the dataset[/dim]")# Upload README to the repository with retry logicimport ioIt's usually better to put all
importstatements together.fixed !!
@ -1829,0 +1915,4 @@commit_message="Add dataset card with documentation")console.print("[green]✓ Dataset card (README.md) uploaded[/green]")readme_uploaded = Trueruff checkreports:fixed !!
@ -1829,0 +1943,4 @@# Network/connection errorsor "connection" in error_str and "refused" in error_stror "connection" in error_str and "reset" in error_stror "connection" in error_str and "timeout" in error_strruff checkreports:fixed !!
@ -1829,0 +1951,4 @@or "timeout" in error_type.lower()# SSL/TLS temporary issuesor "ssl" in error_str and "error" in error_strruff checkreports:fixed !!
@ -1829,0 +1954,4 @@or "ssl" in error_str and "error" in error_str# Proxy errorsor "proxy" in error_str and "error" in error_strruff checkreports:fixed !!
@ -1829,0 +1976,4 @@time.sleep(wait_time)else:console.print(f"[yellow]⚠ Warning: Could not upload README.md after {max_retries} attempts: "ruff checkreports:fixed !!
@ -1829,0 +1986,4 @@else:# Not a retryable error - fail immediatelyconsole.print(f"[yellow]⚠ Warning: Could not upload README.md (non-retryable error): "ruff checkreports:fixed !!
@ -1829,0 +1993,4 @@"[dim]You can manually create a README.md file ""for the dataset[/dim]")break
Lines 1035-1099 and lines 1922-1996 are extremely similar. Could they be unified into a function?
fixed !!
A lot of your new code didn't pass
ruff check.
@ -523,2 +582,2 @@graph = Graph()graph.parse(str(file_path), format=rdf_format)# Check if this is GeoNames format (special handling required)if rdf_format in ("xml", "application/rdf+xml") and _detect_geonames_format(file_path):Crap. I'm sorry to tell you this, but...
ruff check reports:
Let me know if you're having problems running
ruff check locally. fixed !!
@ -828,0 +965,4 @@subject=subject,predicate=predicate,object=obj,object_type="literal", # TSV typically has literalsruff checkreports:fixed !!
@ -828,0 +970,4 @@object_language=None))except Exception as e:logger.debug(f"Skipping malformed TSV line {line_num}: {e}")ruff checkreports:fixed !!
@ -840,0 +990,4 @@# Create train/test split if requestedif config.create_train_test_split:assert isinstance(dataset, Dataset), "Expected Dataset instance"train_test = dataset.train_test_split(test_size=config.test_size, seed=42)ruff checkreports:fixed !!
@ -1161,1 +1324,3 @@)# Handle compressed filesis_gzip = config.input_path.suffix == ".gz" or str(config.input_path).endswith(".gz")is_bz2 = config.input_path.suffix == ".bz2" or str(config.input_path).endswith(".bz2")ruff checkreports:fixed !!
@ -1162,0 +1327,4 @@try:if is_gzip:file_obj = gzip.open(config.input_path, "rt", encoding="utf-8", errors="ignore") # noqa: SIM115ruff checkreports:fixed !!
@ -1162,0 +1329,4 @@if is_gzip:file_obj = gzip.open(config.input_path, "rt", encoding="utf-8", errors="ignore") # noqa: SIM115elif is_bz2:file_obj = bz2.open(config.input_path, "rt", encoding="utf-8", errors="ignore") # noqa: SIM115ruff checkreports:fixed !!
@ -1165,3 +1334,1 @@f"[yellow]Using standard RDF parser for "f"{config.rdf_format} (single-threaded)[/yellow]")file_obj = open(config.input_path, encoding="utf-8", errors="ignore") # noqa: SIM115ruff checkreports:fixed !!
@ -1168,0 +1335,4 @@try:first_line = file_obj.readline().strip()if first_line.startswith("http://") or first_line.startswith("https://"):I don't know why
ruff check is ignoring this line this time around, but it's 98 characters long — longer than ruff check's usual 88 characters. fixed !!
@ -1168,0 +1350,4 @@finally:file_obj.close()except Exception as e:logger.warning(f"Error detecting GeoNames format: {e}, using generic parser")ruff checkreports:fixed !!
@ -1249,0 +1435,4 @@# Create train/test split if requestedif config.create_train_test_split:assert isinstance(dataset, Dataset), "Expected Dataset instance"train_test = dataset.train_test_split(test_size=config.test_size, seed=42)Look. You're able to run
ruff check just as well as I can. I'm just going to report that ruff check failed in the code that you wrote. Please look through the rest of the code that you're adding. fixed !!
5e3be91d50 to 56abec9431
View command line instructions
Checkout
From your project repository, check out a new branch and test the changes.
Merge
Merge the changes and update on Forgejo.
Warning: The "Autodetect manual merge" setting is not enabled for this repository; you will have to mark this pull request as manually merged afterwards.