Everything you need to integrate DataScrub.
npm install @datascrub/sdk
# or
pip install datascrub

import { DataScrub } from '@datascrub/sdk';
const client = new DataScrub({ apiKey: 'ds_live_...' });
const result = await client.parse({
file: './report.pdf',
output: 'markdown', // 'markdown' | 'json' | 'chunks'
options: {
extractTables: true,
chunkSize: 512,
languages: ['en', 'zh'],
}
});
console.log(result.markdown);
console.log(result.chunks); // RAG-ready chunks
console.log(result.tables); // Extracted tables

curl -X POST https://api.datascrub.ai/v1/parse \
-H "Authorization: Bearer ds_live_..." \
-H "Content-Type: multipart/form-data" \
-F "file=@report.pdf" \
-F "output=markdown" \
-F "extract_tables=true"

/v1/parse
Parse a document and return structured output.
file — File (required): PDF, PNG, DOCX, XLSX, PPTX, HTML
output — String: "markdown" | "json" | "chunks" (default: "markdown")
extract_tables — Boolean (default: true)
chunk_size — Number: tokens per chunk (default: 512)
languages — String[]: hint languages for OCR (default: ["en"])

/v1/status/{job_id}
Check the status of an async parsing job.
{
"id": "parse_abc123",
"status": "completed",
"metadata": {
"filename": "report.pdf",
"pages": 12,
"language": "en",
"parse_time_ms": 2840
},
"markdown": "# Title\n\nContent...",
"json": { ... },
"chunks": [
{ "id": "chunk_001", "content": "...", "tokens": 48 }
],
"tables": [
{ "page": 2, "headers": [...], "rows": [...] }
],
"usage": {
"pages_processed": 12,
"credits_remaining": 4988
}
}