Audio plugin host https://kx.studio/carla
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

reader.c 40KB

12 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585
  1. /*
  2. Copyright 2011-2012 David Robillard <http://drobilla.net>
  3. Permission to use, copy, modify, and/or distribute this software for any
  4. purpose with or without fee is hereby granted, provided that the above
  5. copyright notice and this permission notice appear in all copies.
  6. THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  7. WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  8. MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  9. ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  10. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  11. ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  12. OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  13. */
  14. #include "serd_internal.h"
  15. #include <assert.h>
  16. #include <errno.h>
  17. #include <stdarg.h>
  18. #include <stdint.h>
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <string.h>
  22. #define NS_XSD "http://www.w3.org/2001/XMLSchema#"
  23. #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  24. #define TRY_THROW(exp) if (!(exp)) goto except;
  25. #define TRY_RET(exp) if (!(exp)) return 0;
  26. #ifdef SERD_STACK_CHECK
  27. # define SERD_STACK_ASSERT_TOP(reader, ref) \
  28. assert(ref == reader->allocs[reader->n_allocs - 1]);
  29. #else
  30. # define SERD_STACK_ASSERT_TOP(reader, ref)
  31. #endif
  32. typedef struct {
  33. const uint8_t* filename;
  34. unsigned line;
  35. unsigned col;
  36. } Cursor;
  37. typedef uint32_t uchar;
  38. /* Reference to a node in the stack (we can not use pointers since the
  39. stack may be reallocated, invalidating any pointers to elements).
  40. */
  41. typedef size_t Ref;
  42. typedef struct {
  43. Ref graph;
  44. Ref subject;
  45. Ref predicate;
  46. SerdStatementFlags* flags;
  47. } ReadContext;
  48. struct SerdReaderImpl {
  49. void* handle;
  50. void (*free_handle)(void* ptr);
  51. SerdBaseSink base_sink;
  52. SerdPrefixSink prefix_sink;
  53. SerdStatementSink statement_sink;
  54. SerdEndSink end_sink;
  55. SerdErrorSink error_sink;
  56. void* error_handle;
  57. Ref rdf_first;
  58. Ref rdf_rest;
  59. Ref rdf_nil;
  60. SerdNode default_graph;
  61. FILE* fd;
  62. SerdStack stack;
  63. SerdSyntax syntax;
  64. Cursor cur;
  65. uint8_t* buf;
  66. uint8_t* bprefix;
  67. size_t bprefix_len;
  68. unsigned next_id;
  69. uint8_t* read_buf;
  70. int32_t read_head; ///< Offset into read_buf
  71. uint8_t read_byte; ///< 1-byte 'buffer' used when not paging
  72. bool from_file; ///< True iff reading from @ref fd
  73. bool paging; ///< True iff reading a page at a time
  74. bool eof;
  75. bool seen_genid;
  76. #ifdef SERD_STACK_CHECK
  77. Ref* allocs; ///< Stack of push offsets
  78. size_t n_allocs; ///< Number of stack pushes
  79. #endif
  80. };
  81. static int
  82. r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...)
  83. {
  84. va_list args;
  85. va_start(args, fmt);
  86. const SerdError e = {
  87. st, reader->cur.filename, reader->cur.line, reader->cur.col, fmt, &args
  88. };
  89. serd_error(reader->error_sink, reader->error_handle, &e);
  90. va_end(args);
  91. return 0;
  92. }
  93. static inline SerdStatus
  94. page(SerdReader* reader)
  95. {
  96. reader->read_head = 0;
  97. size_t n_read = fread(reader->read_buf, 1, SERD_PAGE_SIZE, reader->fd);
  98. if (n_read == 0) {
  99. reader->read_buf[0] = '\0';
  100. reader->eof = true;
  101. return ferror(reader->fd) ? SERD_ERR_UNKNOWN : SERD_FAILURE;
  102. } else if (n_read < SERD_PAGE_SIZE) {
  103. reader->read_buf[n_read] = '\0';
  104. }
  105. return SERD_SUCCESS;
  106. }
  107. static inline uint8_t
  108. peek_byte(SerdReader* reader)
  109. {
  110. return reader->read_buf[reader->read_head];
  111. }
  112. static inline uint8_t
  113. eat_byte_safe(SerdReader* reader, const uint8_t byte)
  114. {
  115. assert(peek_byte(reader) == byte);
  116. switch (byte) {
  117. case '\0': reader->eof = true; break;
  118. case '\n': ++reader->cur.line; reader->cur.col = 0; break;
  119. default: ++reader->cur.col;
  120. }
  121. if (reader->from_file && !reader->paging) {
  122. const int c = fgetc(reader->fd);
  123. reader->read_byte = (c == EOF) ? 0 : (uint8_t)c;
  124. if (c == EOF) {
  125. reader->eof = true;
  126. }
  127. } else if (++reader->read_head == SERD_PAGE_SIZE && reader->paging) {
  128. page(reader);
  129. }
  130. return byte;
  131. }
  132. static inline uint8_t
  133. eat_byte_check(SerdReader* reader, const uint8_t byte)
  134. {
  135. const uint8_t c = peek_byte(reader);
  136. if (c != byte) {
  137. return r_err(reader, SERD_ERR_BAD_SYNTAX,
  138. "expected `%c', not `%c'\n", byte, c);
  139. }
  140. return eat_byte_safe(reader, byte);
  141. }
  142. static inline void
  143. eat_string(SerdReader* reader, const char* str, unsigned n)
  144. {
  145. for (unsigned i = 0; i < n; ++i) {
  146. eat_byte_check(reader, ((const uint8_t*)str)[i]);
  147. }
  148. }
  149. static Ref
  150. push_node_padded(SerdReader* reader, size_t maxlen,
  151. SerdType type, const char* str, size_t n_bytes)
  152. {
  153. uint8_t* mem = serd_stack_push(&reader->stack,
  154. sizeof(SerdNode) + maxlen + 1);
  155. SerdNode* const node = (SerdNode*)mem;
  156. node->n_bytes = node->n_chars = n_bytes;
  157. node->flags = 0;
  158. node->type = type;
  159. node->buf = NULL;
  160. uint8_t* buf = mem + sizeof(SerdNode);
  161. memcpy(buf, str, n_bytes + 1);
  162. #ifdef SERD_STACK_CHECK
  163. reader->allocs = realloc(
  164. reader->allocs, sizeof(uint8_t*) * (++reader->n_allocs));
  165. reader->allocs[reader->n_allocs - 1] = (mem - reader->stack.buf);
  166. #endif
  167. return (uint8_t*)node - reader->stack.buf;
  168. }
  169. static Ref
  170. push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes)
  171. {
  172. return push_node_padded(reader, n_bytes, type, str, n_bytes);
  173. }
  174. static inline SerdNode*
  175. deref(SerdReader* reader, const Ref ref)
  176. {
  177. if (ref) {
  178. SerdNode* node = (SerdNode*)(reader->stack.buf + ref);
  179. node->buf = (uint8_t*)node + sizeof(SerdNode);
  180. return node;
  181. }
  182. return NULL;
  183. }
  184. static inline void
  185. push_byte(SerdReader* reader, Ref ref, const uint8_t c)
  186. {
  187. SERD_STACK_ASSERT_TOP(reader, ref);
  188. uint8_t* const s = serd_stack_push(&reader->stack, 1);
  189. SerdNode* const node = (SerdNode*)(reader->stack.buf + ref);
  190. ++node->n_bytes;
  191. if (!(c & 0x80)) { // Starts with 0 bit, start of new character
  192. ++node->n_chars;
  193. }
  194. *(s - 1) = c;
  195. *s = '\0';
  196. }
  197. static inline void
  198. push_replacement(SerdReader* reader, Ref dest)
  199. {
  200. push_byte(reader, dest, 0xEF);
  201. push_byte(reader, dest, 0xBF);
  202. push_byte(reader, dest, 0xBD);
  203. }
  204. static Ref
  205. pop_node(SerdReader* reader, Ref ref)
  206. {
  207. if (ref && ref != reader->rdf_first && ref != reader->rdf_rest
  208. && ref != reader->rdf_nil) {
  209. #ifdef SERD_STACK_CHECK
  210. SERD_STACK_ASSERT_TOP(reader, ref);
  211. --reader->n_allocs;
  212. #endif
  213. SerdNode* const node = deref(reader, ref);
  214. uint8_t* const top = reader->stack.buf + reader->stack.size;
  215. serd_stack_pop(&reader->stack, top - (uint8_t*)node);
  216. }
  217. return 0;
  218. }
  219. static inline bool
  220. emit_statement(SerdReader* reader, ReadContext ctx, Ref o, Ref d, Ref l)
  221. {
  222. SerdNode* graph = deref(reader, ctx.graph);
  223. if (!graph && reader->default_graph.buf) {
  224. graph = &reader->default_graph;
  225. }
  226. bool ret = !reader->statement_sink ||
  227. !reader->statement_sink(
  228. reader->handle, *ctx.flags, graph,
  229. deref(reader, ctx.subject), deref(reader, ctx.predicate),
  230. deref(reader, o), deref(reader, d), deref(reader, l));
  231. *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags
  232. return ret;
  233. }
  234. static bool
  235. read_collection(SerdReader* reader, ReadContext ctx, Ref* dest);
  236. static bool
  237. read_predicateObjectList(SerdReader* reader, ReadContext ctx);
  238. // [40] hex ::= [#x30-#x39] | [#x41-#x46]
  239. static inline uint8_t
  240. read_hex(SerdReader* reader)
  241. {
  242. const uint8_t c = peek_byte(reader);
  243. if (in_range(c, 0x30, 0x39) || in_range(c, 0x41, 0x46)) {
  244. return eat_byte_safe(reader, c);
  245. } else {
  246. return r_err(reader, SERD_ERR_BAD_SYNTAX,
  247. "invalid hexadecimal digit `%c'\n", c);
  248. }
  249. }
  250. static inline bool
  251. read_hex_escape(SerdReader* reader, unsigned length, Ref dest)
  252. {
  253. uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  254. for (unsigned i = 0; i < length; ++i) {
  255. if (!(buf[i] = read_hex(reader))) {
  256. return false;
  257. }
  258. }
  259. uint32_t c;
  260. sscanf((const char*)buf, "%X", &c);
  261. unsigned size = 0;
  262. if (c < 0x00000080) {
  263. size = 1;
  264. } else if (c < 0x00000800) {
  265. size = 2;
  266. } else if (c < 0x00010000) {
  267. size = 3;
  268. } else if (c < 0x00110000) {
  269. size = 4;
  270. } else {
  271. r_err(reader, SERD_ERR_BAD_SYNTAX,
  272. "unicode character 0x%X out of range\n", c);
  273. push_replacement(reader, dest);
  274. return true;
  275. }
  276. // Build output in buf
  277. // (Note # of bytes = # of leading 1 bits in first byte)
  278. switch (size) {
  279. case 4:
  280. buf[3] = 0x80 | (uint8_t)(c & 0x3F);
  281. c >>= 6;
  282. c |= (16 << 12); // set bit 4
  283. case 3:
  284. buf[2] = 0x80 | (uint8_t)(c & 0x3F);
  285. c >>= 6;
  286. c |= (32 << 6); // set bit 5
  287. case 2:
  288. buf[1] = 0x80 | (uint8_t)(c & 0x3F);
  289. c >>= 6;
  290. c |= 0xC0; // set bits 6 and 7
  291. case 1:
  292. buf[0] = (uint8_t)c;
  293. }
  294. for (unsigned i = 0; i < size; ++i) {
  295. push_byte(reader, dest, buf[i]);
  296. }
  297. return true;
  298. }
  299. static inline bool
  300. read_character_escape(SerdReader* reader, Ref dest)
  301. {
  302. switch (peek_byte(reader)) {
  303. case '\\':
  304. push_byte(reader, dest, eat_byte_safe(reader, '\\'));
  305. return true;
  306. case 'u':
  307. eat_byte_safe(reader, 'u');
  308. return read_hex_escape(reader, 4, dest);
  309. case 'U':
  310. eat_byte_safe(reader, 'U');
  311. return read_hex_escape(reader, 8, dest);
  312. default:
  313. return false;
  314. }
  315. }
  316. static inline bool
  317. read_echaracter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
  318. {
  319. switch (peek_byte(reader)) {
  320. case 't':
  321. eat_byte_safe(reader, 't');
  322. push_byte(reader, dest, '\t');
  323. return true;
  324. case 'n':
  325. *flags |= SERD_HAS_NEWLINE;
  326. eat_byte_safe(reader, 'n');
  327. push_byte(reader, dest, '\n');
  328. return true;
  329. case 'r':
  330. *flags |= SERD_HAS_NEWLINE;
  331. eat_byte_safe(reader, 'r');
  332. push_byte(reader, dest, '\r');
  333. return true;
  334. default:
  335. return read_character_escape(reader, dest);
  336. }
  337. }
  338. static inline bool
  339. read_scharacter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
  340. {
  341. switch (peek_byte(reader)) {
  342. case '"':
  343. *flags |= SERD_HAS_QUOTE;
  344. push_byte(reader, dest, eat_byte_safe(reader, '"'));
  345. return true;
  346. default:
  347. return read_echaracter_escape(reader, dest, flags);
  348. }
  349. }
  350. static inline bool
  351. read_ucharacter_escape(SerdReader* reader, Ref dest)
  352. {
  353. SerdNodeFlags flags = 0;
  354. switch (peek_byte(reader)) {
  355. case '>':
  356. push_byte(reader, dest, eat_byte_safe(reader, '>'));
  357. return true;
  358. default:
  359. return read_echaracter_escape(reader, dest, &flags);
  360. }
  361. }
  362. static inline SerdStatus
  363. bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
  364. {
  365. r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
  366. push_replacement(reader, dest);
  367. // Skip bytes until the next start byte
  368. for (uint8_t b = peek_byte(reader); (b & 0x80);) {
  369. eat_byte_safe(reader, b);
  370. b = peek_byte(reader);
  371. }
  372. return SERD_SUCCESS;
  373. }
  374. static SerdStatus
  375. read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
  376. {
  377. unsigned size = 1;
  378. if ((c & 0xE0) == 0xC0) { // Starts with `110'
  379. size = 2;
  380. } else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
  381. size = 3;
  382. } else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
  383. size = 4;
  384. } else {
  385. return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n",
  386. eat_byte_safe(reader, c));
  387. }
  388. char bytes[4];
  389. bytes[0] = eat_byte_safe(reader, c);
  390. // Check character validity
  391. for (unsigned i = 1; i < size; ++i) {
  392. if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) {
  393. return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n",
  394. bytes[i]);
  395. }
  396. eat_byte_safe(reader, bytes[i]);
  397. }
  398. // Emit character
  399. for (unsigned i = 0; i < size; ++i) {
  400. push_byte(reader, dest, bytes[i]);
  401. }
  402. return SERD_SUCCESS;
  403. }
  404. // [38] character ::= '\u' hex hex hex hex
  405. // | '\U' hex hex hex hex hex hex hex hex
  406. // | '\\'
  407. // | [#x20-#x5B] | [#x5D-#x10FFFF]
  408. static inline SerdStatus
  409. read_character(SerdReader* reader, Ref dest)
  410. {
  411. const uint8_t c = peek_byte(reader);
  412. assert(c != '\\'); // Only called from methods that handle escapes first
  413. if (c == '\0') {
  414. r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected end of input\n", c);
  415. return SERD_ERR_BAD_SYNTAX;
  416. } else if (c < 0x20) {
  417. return bad_char(reader, dest,
  418. "unexpected control character 0x%X\n",
  419. eat_byte_safe(reader, c));
  420. } else if (!(c & 0x80)) {
  421. push_byte(reader, dest, eat_byte_safe(reader, c));
  422. return SERD_SUCCESS;
  423. } else {
  424. return read_utf8_character(reader, dest, c);
  425. }
  426. }
  427. // [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
  428. static inline SerdStatus
  429. read_lcharacter(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
  430. {
  431. const uint8_t c = peek_byte(reader);
  432. uint8_t buf[2];
  433. switch (c) {
  434. case '"':
  435. eat_byte_safe(reader, '\"');
  436. buf[0] = eat_byte_safe(reader, peek_byte(reader));
  437. buf[1] = eat_byte_safe(reader, peek_byte(reader));
  438. if (buf[0] == '\"' && buf[1] == '\"') {
  439. return SERD_FAILURE;
  440. } else {
  441. *flags |= SERD_HAS_QUOTE;
  442. push_byte(reader, dest, c);
  443. push_byte(reader, dest, buf[0]);
  444. push_byte(reader, dest, buf[1]);
  445. return SERD_SUCCESS;
  446. }
  447. case '\\':
  448. eat_byte_safe(reader, '\\');
  449. if (read_scharacter_escape(reader, dest, flags)) {
  450. return SERD_SUCCESS;
  451. } else {
  452. r_err(reader, SERD_ERR_BAD_SYNTAX,
  453. "invalid escape `\\%c'\n", peek_byte(reader));
  454. return SERD_ERR_BAD_SYNTAX;
  455. }
  456. case 0xA: case 0xD:
  457. *flags |= SERD_HAS_NEWLINE;
  458. case 0x9:
  459. push_byte(reader, dest, eat_byte_safe(reader, c));
  460. return SERD_SUCCESS;
  461. default:
  462. return read_character(reader, dest);
  463. }
  464. }
  465. // [42] scharacter ::= ( echaracter - #x22 ) | '\"'
  466. static inline SerdStatus
  467. read_scharacter(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
  468. {
  469. uint8_t c = peek_byte(reader);
  470. switch (c) {
  471. case '\\':
  472. eat_byte_safe(reader, '\\');
  473. if (read_scharacter_escape(reader, dest, flags)) {
  474. return SERD_SUCCESS;
  475. } else {
  476. r_err(reader, SERD_ERR_BAD_SYNTAX,
  477. "invalid escape `\\%c'\n", peek_byte(reader));
  478. return SERD_ERR_BAD_SYNTAX;
  479. }
  480. case '\"':
  481. return SERD_FAILURE;
  482. default:
  483. return read_character(reader, dest);
  484. }
  485. }
  486. // Spec: [41] ucharacter ::= ( character - #x3E ) | '\>'
  487. // Impl: [41] ucharacter ::= ( echaracter - #x3E ) | '\>'
  488. static inline SerdStatus
  489. read_ucharacter(SerdReader* reader, Ref dest)
  490. {
  491. const uint8_t c = peek_byte(reader);
  492. switch (c) {
  493. case '\\':
  494. eat_byte_safe(reader, '\\');
  495. if (read_ucharacter_escape(reader, dest)) {
  496. return SERD_SUCCESS;
  497. } else {
  498. r_err(reader, SERD_ERR_BAD_SYNTAX,
  499. "invalid escape `\\%c'\n", peek_byte(reader));
  500. return SERD_FAILURE;
  501. }
  502. case '>':
  503. return SERD_FAILURE;
  504. default:
  505. return read_character(reader, dest);
  506. }
  507. }
  508. // [10] comment ::= '#' ( [^#xA #xD] )*
  509. static void
  510. read_comment(SerdReader* reader)
  511. {
  512. eat_byte_safe(reader, '#');
  513. uint8_t c;
  514. while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) {
  515. eat_byte_safe(reader, c);
  516. }
  517. }
  518. // [24] ws ::= #x9 | #xA | #xD | #x20 | comment
  519. static inline bool
  520. read_ws(SerdReader* reader)
  521. {
  522. const uint8_t c = peek_byte(reader);
  523. switch (c) {
  524. case 0x9: case 0xA: case 0xD: case 0x20:
  525. eat_byte_safe(reader, c);
  526. return true;
  527. case '#':
  528. read_comment(reader);
  529. return true;
  530. default:
  531. return false;
  532. }
  533. }
  534. static inline bool
  535. read_ws_star(SerdReader* reader)
  536. {
  537. while (read_ws(reader)) {}
  538. return true;
  539. }
  540. static inline bool
  541. read_ws_plus(SerdReader* reader)
  542. {
  543. TRY_RET(read_ws(reader));
  544. return read_ws_star(reader);
  545. }
  546. static inline bool
  547. peek_delim(SerdReader* reader, const char delim)
  548. {
  549. read_ws_star(reader);
  550. return peek_byte(reader) == delim;
  551. }
  552. static inline bool
  553. eat_delim(SerdReader* reader, const char delim)
  554. {
  555. if (peek_delim(reader, delim)) {
  556. eat_byte_safe(reader, delim);
  557. return read_ws_star(reader);
  558. }
  559. return false;
  560. }
  561. // [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
  562. static Ref
  563. read_longString(SerdReader* reader, SerdNodeFlags* flags)
  564. {
  565. Ref ref = push_node(reader, SERD_LITERAL, "", 0);
  566. SerdStatus st;
  567. while (!(st = read_lcharacter(reader, ref, flags))) {}
  568. if (st < SERD_ERR_UNKNOWN) {
  569. return ref;
  570. }
  571. return pop_node(reader, ref);
  572. }
  573. // [36] string ::= #x22 scharacter* #x22
  574. static Ref
  575. read_string(SerdReader* reader, SerdNodeFlags* flags)
  576. {
  577. Ref ref = push_node(reader, SERD_LITERAL, "", 0);
  578. SerdStatus st;
  579. while (!(st = read_scharacter(reader, ref, flags))) {}
  580. if (st < SERD_ERR_UNKNOWN) {
  581. eat_byte_check(reader, '\"');
  582. return ref;
  583. }
  584. return pop_node(reader, ref);
  585. }
  586. // [35] quotedString ::= string | longString
  587. static Ref
  588. read_quotedString(SerdReader* reader, SerdNodeFlags* flags)
  589. {
  590. eat_byte_safe(reader, '\"'); // q1
  591. const uint8_t q2 = peek_byte(reader);
  592. if (q2 != '\"') { // Non-empty single-quoted string
  593. return read_string(reader, flags);
  594. }
  595. eat_byte_safe(reader, q2);
  596. const uint8_t q3 = peek_byte(reader);
  597. if (q3 != '\"') { // Empty single-quoted string
  598. return push_node(reader, SERD_LITERAL, "", 0);
  599. }
  600. eat_byte_safe(reader, '\"');
  601. return read_longString(reader, flags);
  602. }
  603. // [34] relativeURI ::= ucharacter*
  604. static inline Ref
  605. read_relativeURI(SerdReader* reader)
  606. {
  607. Ref ref = push_node(reader, SERD_URI, "", 0);
  608. SerdStatus st;
  609. while (!(st = read_ucharacter(reader, ref))) {}
  610. if (st < SERD_ERR_UNKNOWN) {
  611. return ref;
  612. }
  613. return pop_node(reader, ref);
  614. }
  615. // [30] nameStartChar ::= [A-Z] | "_" | [a-z]
  616. // | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D]
  617. // | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
  618. // | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  619. static inline uchar
  620. read_nameStartChar(SerdReader* reader)
  621. {
  622. const uint8_t c = peek_byte(reader);
  623. if (c == '_' || is_alpha(c) || is_digit(c)) { // TODO: Not correct
  624. return eat_byte_safe(reader, c);
  625. }
  626. return 0;
  627. }
  628. // [31] nameChar ::= nameStartChar | '-' | [0-9]
  629. // | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
  630. static inline uchar
  631. read_nameChar(SerdReader* reader)
  632. {
  633. uchar c = read_nameStartChar(reader);
  634. if (c)
  635. return c;
  636. switch ((c = peek_byte(reader))) {
  637. case '-': case 0xB7: case '0': case '1': case '2': case '3': case '4':
  638. case '5': case '6': case '7': case '8': case '9':
  639. return eat_byte_safe(reader, c);
  640. default: // TODO: 0x300-0x036F | 0x203F-0x2040
  641. return 0;
  642. }
  643. return 0;
  644. }
  645. // [33] prefixName ::= ( nameStartChar - '_' ) nameChar*
  646. static Ref
  647. read_prefixName(SerdReader* reader, Ref dest)
  648. {
  649. uint8_t c = peek_byte(reader);
  650. if (c == '_') {
  651. r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `_'\n");
  652. return pop_node(reader, dest);
  653. }
  654. TRY_RET(c = read_nameStartChar(reader));
  655. if (!dest) {
  656. dest = push_node(reader, SERD_CURIE, "", 0);
  657. }
  658. push_byte(reader, dest, c);
  659. while ((c = read_nameChar(reader))) {
  660. push_byte(reader, dest, c);
  661. }
  662. return dest;
  663. }
  664. // [32] name ::= nameStartChar nameChar*
  665. static Ref
  666. read_name(SerdReader* reader, Ref dest)
  667. {
  668. uchar c = read_nameStartChar(reader);
  669. if (!c) {
  670. return 0;
  671. }
  672. do {
  673. push_byte(reader, dest, c);
  674. } while ((c = read_nameChar(reader)) != 0);
  675. return dest;
  676. }
  677. // [29] language ::= [a-z]+ ('-' [a-z0-9]+ )*
  678. static Ref
  679. read_language(SerdReader* reader)
  680. {
  681. uint8_t c = peek_byte(reader);
  682. if (!in_range(c, 'a', 'z')) {
  683. return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c);
  684. }
  685. Ref ref = push_node(reader, SERD_LITERAL, "", 0);
  686. push_byte(reader, ref, eat_byte_safe(reader, c));
  687. while ((c = peek_byte(reader)) && in_range(c, 'a', 'z')) {
  688. push_byte(reader, ref, eat_byte_safe(reader, c));
  689. }
  690. while (peek_byte(reader) == '-') {
  691. push_byte(reader, ref, eat_byte_safe(reader, '-'));
  692. while ((c = peek_byte(reader)) && (
  693. in_range(c, 'a', 'z') || in_range(c, '0', '9'))) {
  694. push_byte(reader, ref, eat_byte_safe(reader, c));
  695. }
  696. }
  697. return ref;
  698. }
  699. // [28] uriref ::= '<' relativeURI '>'
  700. static Ref
  701. read_uriref(SerdReader* reader)
  702. {
  703. TRY_RET(eat_byte_check(reader, '<'));
  704. Ref const str = read_relativeURI(reader);
  705. if (str && eat_byte_check(reader, '>')) {
  706. return str;
  707. }
  708. return pop_node(reader, str);
  709. }
  710. // [27] qname ::= prefixName? ':' name?
  711. static Ref
  712. read_qname(SerdReader* reader, Ref dest, bool read_prefix)
  713. {
  714. Ref str = 0;
  715. if (!dest) {
  716. dest = push_node(reader, SERD_CURIE, "", 0);
  717. }
  718. if (read_prefix) {
  719. read_prefixName(reader, dest);
  720. }
  721. TRY_THROW(eat_byte_check(reader, ':'));
  722. push_byte(reader, dest, ':');
  723. str = read_name(reader, dest);
  724. return str ? str : dest;
  725. except:
  726. return pop_node(reader, dest);
  727. }
  728. static bool
  729. read_0_9(SerdReader* reader, Ref str, bool at_least_one)
  730. {
  731. uint8_t c;
  732. if (at_least_one) {
  733. if (!is_digit((c = peek_byte(reader)))) {
  734. return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n");
  735. }
  736. push_byte(reader, str, eat_byte_safe(reader, c));
  737. }
  738. while (is_digit((c = peek_byte(reader)))) {
  739. push_byte(reader, str, eat_byte_safe(reader, c));
  740. }
  741. return true;
  742. }
  743. // [19] exponent ::= [eE] ('-' | '+')? [0-9]+
  744. // [18] decimal ::= ( '-' | '+' )? ( [0-9]+ '.' [0-9]*
  745. // | '.' ([0-9])+
  746. // | ([0-9])+ )
  747. // [17] double ::= ( '-' | '+' )? ( [0-9]+ '.' [0-9]* exponent
  748. // | '.' ([0-9])+ exponent
  749. // | ([0-9])+ exponent )
  750. // [16] integer ::= ( '-' | '+' ) ? [0-9]+
  751. static bool
  752. read_number(SerdReader* reader, Ref* dest, Ref* datatype)
  753. {
  754. #define XSD_DECIMAL NS_XSD "decimal"
  755. #define XSD_DOUBLE NS_XSD "double"
  756. #define XSD_INTEGER NS_XSD "integer"
  757. Ref ref = push_node(reader, SERD_LITERAL, "", 0);
  758. uint8_t c = peek_byte(reader);
  759. bool has_decimal = false;
  760. if (c == '-' || c == '+') {
  761. push_byte(reader, ref, eat_byte_safe(reader, c));
  762. }
  763. if ((c = peek_byte(reader)) == '.') {
  764. has_decimal = true;
  765. // decimal case 2 (e.g. '.0' or `-.0' or `+.0')
  766. push_byte(reader, ref, eat_byte_safe(reader, c));
  767. TRY_THROW(read_0_9(reader, ref, true));
  768. } else {
  769. // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
  770. assert(is_digit(c));
  771. read_0_9(reader, ref, true);
  772. if ((c = peek_byte(reader)) == '.') {
  773. has_decimal = true;
  774. push_byte(reader, ref, eat_byte_safe(reader, c));
  775. read_0_9(reader, ref, false);
  776. }
  777. }
  778. c = peek_byte(reader);
  779. if (c == 'e' || c == 'E') {
  780. // double
  781. push_byte(reader, ref, eat_byte_safe(reader, c));
  782. switch ((c = peek_byte(reader))) {
  783. case '+': case '-':
  784. push_byte(reader, ref, eat_byte_safe(reader, c));
  785. default: break;
  786. }
  787. read_0_9(reader, ref, true);
  788. *datatype = push_node(reader, SERD_URI,
  789. XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1);
  790. } else if (has_decimal) {
  791. *datatype = push_node(reader, SERD_URI,
  792. XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1);
  793. } else {
  794. *datatype = push_node(reader, SERD_URI,
  795. XSD_INTEGER, sizeof(XSD_INTEGER) - 1);
  796. }
  797. *dest = ref;
  798. return true;
  799. except:
  800. pop_node(reader, *datatype);
  801. pop_node(reader, ref);
  802. return false;
  803. }
  804. // [25] resource ::= uriref | qname
  805. static bool
  806. read_resource(SerdReader* reader, Ref* dest)
  807. {
  808. switch (peek_byte(reader)) {
  809. case '<':
  810. *dest = read_uriref(reader);
  811. break;
  812. default:
  813. *dest = read_qname(reader, 0, true);
  814. }
  815. return *dest != 0;
  816. }
  817. static bool
  818. read_literal(SerdReader* reader, Ref* dest,
  819. Ref* datatype, Ref* lang, SerdNodeFlags* flags)
  820. {
  821. Ref str = read_quotedString(reader, flags);
  822. if (!str) {
  823. return false;
  824. }
  825. switch (peek_byte(reader)) {
  826. case '^':
  827. eat_byte_safe(reader, '^');
  828. eat_byte_check(reader, '^');
  829. TRY_THROW(read_resource(reader, datatype));
  830. break;
  831. case '@':
  832. eat_byte_safe(reader, '@');
  833. TRY_THROW(*lang = read_language(reader));
  834. }
  835. *dest = str;
  836. return true;
  837. except:
  838. pop_node(reader, str);
  839. return false;
  840. }
  /* Return true iff `c` ends a bare token: tab, LF, CR, space, NUL, or one of
     '#', '.', ';', '<'. */
  inline static bool
  is_token_end(uint8_t c)
  {
    const bool is_blank = (c == 0x9 || c == 0xA || c == 0xD || c == 0x20);
    const bool is_delim = (c == '#' || c == '.' || c == ';' || c == '<');
    return is_blank || is_delim || c == '\0';
  }
  852. // [9] verb ::= predicate | 'a'
  853. static bool
  854. read_verb(SerdReader* reader, Ref* dest)
  855. {
  856. SerdNode* node;
  857. bool ret;
  858. switch (peek_byte(reader)) {
  859. case '<':
  860. ret = (*dest = read_uriref(reader));
  861. break;
  862. default:
  863. /* Either a qname, or "a". Read the prefix first, and if it is in fact
  864. "a", produce that instead.
  865. */
  866. *dest = read_prefixName(reader, 0);
  867. node = deref(reader, *dest);
  868. if (node && node->n_bytes == 1 && node->buf[0] == 'a'
  869. && is_token_end(peek_byte(reader))) {
  870. pop_node(reader, *dest);
  871. ret = (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47));
  872. } else {
  873. ret = (*dest = read_qname(reader, *dest, false));
  874. }
  875. }
  876. read_ws_star(reader);
  877. return ret;
  878. }
  879. // [26] nodeID ::= '_:' name
  880. static Ref
  881. read_nodeID(SerdReader* reader)
  882. {
  883. eat_byte_safe(reader, '_');
  884. eat_byte_check(reader, ':');
  885. Ref ref = push_node(reader, SERD_BLANK,
  886. reader->bprefix ? (char*)reader->bprefix : "",
  887. reader->bprefix_len);
  888. if (!read_name(reader, ref)) {
  889. return r_err(reader, SERD_ERR_BAD_SYNTAX,
  890. "invalid character at start of name\n");
  891. }
  892. if (reader->syntax == SERD_TURTLE) {
  893. const char* const buf = (const char*)deref(reader, ref)->buf;
  894. if (!strncmp(buf, "genid", 5)) {
  895. memcpy((char*)buf, "docid", 5); // Prevent clash
  896. reader->seen_genid = true;
  897. } else if (reader->seen_genid && !strncmp(buf, "docid", 5)) {
  898. r_err(reader, SERD_ERR_ID_CLASH,
  899. "found both `genid' and `docid' IDs, prefix required\n");
  900. return pop_node(reader, ref);
  901. }
  902. }
  903. return ref;
  904. }
  905. static void
  906. set_blank_id(SerdReader* reader, Ref ref, size_t buf_size)
  907. {
  908. SerdNode* node = deref(reader, ref);
  909. const char* prefix = reader->bprefix ? (const char*)reader->bprefix : "";
  910. node->n_bytes = node->n_chars = snprintf(
  911. (char*)node->buf, buf_size, "%sgenid%u", prefix, reader->next_id++);
  912. }
  913. static size_t
  914. genid_size(SerdReader* reader)
  915. {
  916. return reader->bprefix_len + 5 + 10 + 1; // + "genid" + UINT32_MAX + \0
  917. }
  918. static Ref
  919. blank_id(SerdReader* reader)
  920. {
  921. Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
  922. set_blank_id(reader, ref, genid_size(reader));
  923. return ref;
  924. }
  925. // Spec: [21] blank ::= nodeID | '[]'
  926. // | '[' predicateObjectList ']' | collection
  927. // Impl: [21] blank ::= nodeID | '[' ws* ']'
  928. // | '[' ws* predicateObjectList ws* ']' | collection
  929. static bool
  930. read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
  931. {
  932. const SerdStatementFlags old_flags = *ctx.flags;
  933. bool empty;
  934. switch (peek_byte(reader)) {
  935. case '_':
  936. return (*dest = read_nodeID(reader));
  937. case '[':
  938. eat_byte_safe(reader, '[');
  939. if ((empty = peek_delim(reader, ']'))) {
  940. *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O;
  941. } else {
  942. *ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN;
  943. }
  944. *dest = blank_id(reader);
  945. if (ctx.subject) {
  946. TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
  947. }
  948. ctx.subject = *dest;
  949. if (!empty) {
  950. *ctx.flags &= ~(SERD_LIST_CONT);
  951. if (!subject) {
  952. *ctx.flags |= SERD_ANON_CONT;
  953. }
  954. read_predicateObjectList(reader, ctx);
  955. read_ws_star(reader);
  956. if (reader->end_sink) {
  957. reader->end_sink(reader->handle, deref(reader, *dest));
  958. }
  959. *ctx.flags = old_flags;
  960. }
  961. eat_byte_check(reader, ']');
  962. return true;
  963. case '(':
  964. return read_collection(reader, ctx, dest);
  965. default:
  966. return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid blank node\n");
  967. }
  968. }
  969. // [13] object ::= resource | blank | literal
  970. // Recurses, calling statement_sink for every statement encountered.
  971. // Leaves stack in original calling state (i.e. pops everything it pushes).
  972. static bool
  973. read_object(SerdReader* reader, ReadContext ctx)
  974. {
  975. static const char* const XSD_BOOLEAN = NS_XSD "boolean";
  976. static const size_t XSD_BOOLEAN_LEN = 40;
  977. #ifndef NDEBUG
  978. const size_t orig_stack_size = reader->stack.size;
  979. #endif
  980. bool ret = false;
  981. bool emit = (ctx.subject != 0);
  982. SerdNode* node = NULL;
  983. Ref o = 0;
  984. Ref datatype = 0;
  985. Ref lang = 0;
  986. uint32_t flags = 0;
  987. const uint8_t c = peek_byte(reader);
  988. switch (c) {
  989. case '\0':
  990. case ')':
  991. return false;
  992. case '[': case '(':
  993. emit = false;
  994. // fall through
  995. case '_':
  996. TRY_THROW(ret = read_blank(reader, ctx, false, &o));
  997. break;
  998. case '<': case ':':
  999. TRY_THROW(ret = read_resource(reader, &o));
  1000. break;
  1001. case '+': case '-': case '.': case '0': case '1': case '2': case '3':
  1002. case '4': case '5': case '6': case '7': case '8': case '9':
  1003. TRY_THROW(ret = read_number(reader, &o, &datatype));
  1004. break;
  1005. case '\"':
  1006. TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags));
  1007. break;
  1008. default:
  1009. /* Either a boolean literal, or a qname. Read the prefix first, and if
  1010. it is in fact a "true" or "false" literal, produce that instead.
  1011. */
  1012. o = read_prefixName(reader, 0);
  1013. node = deref(reader, o);
  1014. if (node && is_token_end(peek_byte(reader)) &&
  1015. ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4))
  1016. || (node->n_bytes == 5 && !memcmp(node->buf, "false", 5)))) {
  1017. node->type = SERD_LITERAL;
  1018. datatype = push_node(reader, SERD_URI,
  1019. XSD_BOOLEAN, XSD_BOOLEAN_LEN);
  1020. } else {
  1021. o = o ? o : push_node(reader, SERD_CURIE, "", 0);
  1022. o = read_qname(reader, o, false);
  1023. }
  1024. ret = o;
  1025. }
  1026. if (ret && emit) {
  1027. deref(reader, o)->flags = flags;
  1028. ret = emit_statement(reader, ctx, o, datatype, lang);
  1029. }
  1030. except:
  1031. pop_node(reader, lang);
  1032. pop_node(reader, datatype);
  1033. pop_node(reader, o);
  1034. #ifndef NDEBUG
  1035. assert(reader->stack.size == orig_stack_size);
  1036. #endif
  1037. return ret;
  1038. }
  1039. // Spec: [8] objectList ::= object ( ',' object )*
  1040. // Impl: [8] objectList ::= object ( ws* ',' ws* object )*
  1041. static bool
  1042. read_objectList(SerdReader* reader, ReadContext ctx)
  1043. {
  1044. TRY_RET(read_object(reader, ctx));
  1045. while (eat_delim(reader, ',')) {
  1046. TRY_RET(read_object(reader, ctx));
  1047. }
  1048. return true;
  1049. }
  1050. // Spec: [7] predicateObjectList ::= verb objectList
  1051. // (';' verb objectList)* (';')?
  1052. // Impl: [7] predicateObjectList ::= verb ws* objectList
  1053. // (ws* ';' ws* verb ws+ objectList)* (';')?
  1054. static bool
  1055. read_predicateObjectList(SerdReader* reader, ReadContext ctx)
  1056. {
  1057. TRY_RET(read_verb(reader, &ctx.predicate));
  1058. TRY_THROW(read_objectList(reader, ctx));
  1059. ctx.predicate = pop_node(reader, ctx.predicate);
  1060. while (eat_delim(reader, ';')) {
  1061. switch (peek_byte(reader)) {
  1062. case '.': case ']':
  1063. return true;
  1064. default:
  1065. TRY_THROW(read_verb(reader, &ctx.predicate));
  1066. TRY_THROW(read_objectList(reader, ctx));
  1067. ctx.predicate = pop_node(reader, ctx.predicate);
  1068. }
  1069. }
  1070. pop_node(reader, ctx.predicate);
  1071. return true;
  1072. except:
  1073. pop_node(reader, ctx.predicate);
  1074. return false;
  1075. }
  1076. static bool
  1077. end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret)
  1078. {
  1079. pop_node(reader, n2);
  1080. pop_node(reader, n1);
  1081. *ctx.flags &= ~SERD_LIST_CONT;
  1082. return ret && (eat_byte_safe(reader, ')') == ')');
  1083. }
  1084. // [22] itemList ::= object+
  1085. // [23] collection ::= '(' itemList? ')'
  1086. static bool
  1087. read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
  1088. {
  1089. eat_byte_safe(reader, '(');
  1090. bool end = peek_delim(reader, ')');
  1091. *dest = end ? reader->rdf_nil : blank_id(reader);
  1092. if (ctx.subject) {
  1093. // subject predicate _:head
  1094. *ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
  1095. TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
  1096. *ctx.flags |= SERD_LIST_CONT;
  1097. } else {
  1098. *ctx.flags |= (end ? 0 : SERD_LIST_S_BEGIN);
  1099. }
  1100. if (end) {
  1101. return end_collection(reader, ctx, 0, 0, true);
  1102. }
  1103. /* The order of node allocation here is necessarily not in stack order,
  1104. so we create two nodes and recycle them throughout. */
  1105. Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
  1106. Ref n2 = 0;
  1107. Ref node = n1;
  1108. Ref rest = 0;
  1109. ctx.subject = *dest;
  1110. while (!(end = peek_delim(reader, ')'))) {
  1111. // _:node rdf:first object
  1112. ctx.predicate = reader->rdf_first;
  1113. if (!read_object(reader, ctx)) {
  1114. return end_collection(reader, ctx, n1, n2, false);
  1115. }
  1116. if (!(end = peek_delim(reader, ')'))) {
  1117. /* Give rest a new ID. Done as late as possible to ensure it is
  1118. used and > IDs generated by read_object above. */
  1119. if (!rest) {
  1120. rest = n2 = blank_id(reader); // First pass, push a new node
  1121. } else {
  1122. set_blank_id(reader, rest, genid_size(reader));
  1123. }
  1124. }
  1125. // _:node rdf:rest _:rest
  1126. *ctx.flags |= SERD_LIST_CONT;
  1127. ctx.predicate = reader->rdf_rest;
  1128. TRY_RET(emit_statement(reader, ctx,
  1129. (end ? reader->rdf_nil : rest), 0, 0));
  1130. ctx.subject = rest; // _:node = _:rest
  1131. rest = node; // _:rest = (old)_:node
  1132. node = ctx.subject; // invariant
  1133. }
  1134. return end_collection(reader, ctx, n1, n2, true);
  1135. }
  1136. // [11] subject ::= resource | blank
  1137. static Ref
  1138. read_subject(SerdReader* reader, ReadContext ctx)
  1139. {
  1140. Ref subject = 0;
  1141. switch (peek_byte(reader)) {
  1142. case '[': case '(': case '_':
  1143. read_blank(reader, ctx, true, &subject);
  1144. break;
  1145. default:
  1146. read_resource(reader, &subject);
  1147. }
  1148. return subject;
  1149. }
  1150. // Spec: [6] triples ::= subject predicateObjectList
  1151. // Impl: [6] triples ::= subject ws+ predicateObjectList
  1152. static bool
  1153. read_triples(SerdReader* reader, ReadContext ctx)
  1154. {
  1155. const Ref subject = read_subject(reader, ctx);
  1156. bool ret = false;
  1157. if (subject) {
  1158. ctx.subject = subject;
  1159. TRY_RET(read_ws_plus(reader));
  1160. ret = read_predicateObjectList(reader, ctx);
  1161. pop_node(reader, subject);
  1162. }
  1163. ctx.subject = ctx.predicate = 0;
  1164. return ret;
  1165. }
  1166. // [5] base ::= '@base' ws+ uriref
  1167. static bool
  1168. read_base(SerdReader* reader)
  1169. {
  1170. // `@' is already eaten in read_directive
  1171. eat_string(reader, "base", 4);
  1172. TRY_RET(read_ws_plus(reader));
  1173. Ref uri;
  1174. TRY_RET(uri = read_uriref(reader));
  1175. if (reader->base_sink) {
  1176. reader->base_sink(reader->handle, deref(reader, uri));
  1177. }
  1178. pop_node(reader, uri);
  1179. return true;
  1180. }
  1181. // Spec: [4] prefixID ::= '@prefix' ws+ prefixName? ':' uriref
  1182. // Impl: [4] prefixID ::= '@prefix' ws+ prefixName? ':' ws* uriref
  1183. static bool
  1184. read_prefixID(SerdReader* reader)
  1185. {
  1186. bool ret = true;
  1187. Ref name = 0;
  1188. Ref uri = 0;
  1189. // `@' is already eaten in read_directive
  1190. eat_string(reader, "prefix", 6);
  1191. TRY_RET(read_ws_plus(reader));
  1192. name = read_prefixName(reader, 0);
  1193. if (!name) {
  1194. name = push_node(reader, SERD_LITERAL, "", 0);
  1195. }
  1196. TRY_THROW(eat_byte_check(reader, ':') == ':');
  1197. read_ws_star(reader);
  1198. TRY_THROW(uri = read_uriref(reader));
  1199. if (reader->prefix_sink) {
  1200. ret = !reader->prefix_sink(reader->handle,
  1201. deref(reader, name),
  1202. deref(reader, uri));
  1203. }
  1204. pop_node(reader, uri);
  1205. except:
  1206. pop_node(reader, name);
  1207. return ret;
  1208. }
  1209. // [3] directive ::= prefixID | base
  1210. static bool
  1211. read_directive(SerdReader* reader)
  1212. {
  1213. eat_byte_safe(reader, '@');
  1214. switch (peek_byte(reader)) {
  1215. case 'b': return read_base(reader);
  1216. case 'p': return read_prefixID(reader);
  1217. default: return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid directive\n");
  1218. }
  1219. }
  1220. // Spec: [1] statement ::= directive '.' | triples '.' | ws+
  1221. // Impl: [1] statement ::= directive ws* '.' | triples ws* '.' | ws+
  1222. static bool
  1223. read_statement(SerdReader* reader)
  1224. {
  1225. SerdStatementFlags flags = 0;
  1226. ReadContext ctx = { 0, 0, 0, &flags };
  1227. read_ws_star(reader);
  1228. switch (peek_byte(reader)) {
  1229. case '\0':
  1230. reader->eof = true;
  1231. return true;
  1232. case '@':
  1233. TRY_RET(read_directive(reader));
  1234. break;
  1235. default:
  1236. TRY_RET(read_triples(reader, ctx));
  1237. break;
  1238. }
  1239. read_ws_star(reader);
  1240. return eat_byte_check(reader, '.');
  1241. }
  1242. // [1] turtleDoc ::= (statement)*
  1243. static bool
  1244. read_turtleDoc(SerdReader* reader)
  1245. {
  1246. while (!reader->eof) {
  1247. TRY_RET(read_statement(reader));
  1248. }
  1249. return true;
  1250. }
  1251. SERD_API
  1252. SerdReader*
  1253. serd_reader_new(SerdSyntax syntax,
  1254. void* handle,
  1255. void (*free_handle)(void*),
  1256. SerdBaseSink base_sink,
  1257. SerdPrefixSink prefix_sink,
  1258. SerdStatementSink statement_sink,
  1259. SerdEndSink end_sink)
  1260. {
  1261. const Cursor cur = { NULL, 0, 0 };
  1262. SerdReader* me = (SerdReader*)malloc(sizeof(struct SerdReaderImpl));
  1263. me->handle = handle;
  1264. me->free_handle = free_handle;
  1265. me->base_sink = base_sink;
  1266. me->prefix_sink = prefix_sink;
  1267. me->statement_sink = statement_sink;
  1268. me->end_sink = end_sink;
  1269. me->error_sink = NULL;
  1270. me->error_handle = NULL;
  1271. me->default_graph = SERD_NODE_NULL;
  1272. me->fd = 0;
  1273. me->stack = serd_stack_new(SERD_PAGE_SIZE);
  1274. me->syntax = syntax;
  1275. me->cur = cur;
  1276. me->bprefix = NULL;
  1277. me->bprefix_len = 0;
  1278. me->next_id = 1;
  1279. me->read_buf = 0;
  1280. me->read_head = 0;
  1281. me->eof = false;
  1282. me->seen_genid = false;
  1283. #ifdef SERD_STACK_CHECK
  1284. me->allocs = 0;
  1285. me->n_allocs = 0;
  1286. #endif
  1287. me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48);
  1288. me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47);
  1289. me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46);
  1290. return me;
  1291. }
  1292. SERD_API
  1293. void
  1294. serd_reader_set_error_sink(SerdReader* reader,
  1295. SerdErrorSink error_sink,
  1296. void* error_handle)
  1297. {
  1298. reader->error_sink = error_sink;
  1299. reader->error_handle = error_handle;
  1300. }
  1301. SERD_API
  1302. void
  1303. serd_reader_free(SerdReader* reader)
  1304. {
  1305. pop_node(reader, reader->rdf_nil);
  1306. pop_node(reader, reader->rdf_rest);
  1307. pop_node(reader, reader->rdf_first);
  1308. serd_node_free(&reader->default_graph);
  1309. #ifdef SERD_STACK_CHECK
  1310. free(reader->allocs);
  1311. #endif
  1312. free(reader->stack.buf);
  1313. free(reader->bprefix);
  1314. if (reader->free_handle) {
  1315. reader->free_handle(reader->handle);
  1316. }
  1317. free(reader);
  1318. }
  1319. SERD_API
  1320. void*
  1321. serd_reader_get_handle(const SerdReader* reader)
  1322. {
  1323. return reader->handle;
  1324. }
  1325. SERD_API
  1326. void
  1327. serd_reader_add_blank_prefix(SerdReader* reader,
  1328. const uint8_t* prefix)
  1329. {
  1330. free(reader->bprefix);
  1331. reader->bprefix_len = 0;
  1332. reader->bprefix = NULL;
  1333. if (prefix) {
  1334. reader->bprefix_len = strlen((const char*)prefix);
  1335. reader->bprefix = (uint8_t*)malloc(reader->bprefix_len + 1);
  1336. memcpy(reader->bprefix, prefix, reader->bprefix_len + 1);
  1337. }
  1338. }
  1339. SERD_API
  1340. void
  1341. serd_reader_set_default_graph(SerdReader* reader,
  1342. const SerdNode* graph)
  1343. {
  1344. serd_node_free(&reader->default_graph);
  1345. reader->default_graph = serd_node_copy(graph);
  1346. }
  1347. SERD_API
  1348. SerdStatus
  1349. serd_reader_read_file(SerdReader* reader,
  1350. const uint8_t* uri)
  1351. {
  1352. const uint8_t* path = serd_uri_to_path(uri);
  1353. if (!path) {
  1354. return SERD_ERR_BAD_ARG;
  1355. }
  1356. FILE* fd = serd_fopen((const char*)path, "r");
  1357. if (!fd) {
  1358. return SERD_ERR_UNKNOWN;
  1359. }
  1360. SerdStatus ret = serd_reader_read_file_handle(reader, fd, path);
  1361. fclose(fd);
  1362. return ret;
  1363. }
  1364. static void
  1365. skip_bom(SerdReader* me)
  1366. {
  1367. const uint8_t* const b = me->read_buf;
  1368. if (me->paging && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
  1369. me->read_head += 3;
  1370. }
  1371. }
  1372. SERD_API
  1373. SerdStatus
  1374. serd_reader_start_stream(SerdReader* me,
  1375. FILE* file,
  1376. const uint8_t* name,
  1377. bool bulk)
  1378. {
  1379. const Cursor cur = { name, 1, 1 };
  1380. me->fd = file;
  1381. me->read_head = 0;
  1382. me->cur = cur;
  1383. me->from_file = true;
  1384. me->eof = false;
  1385. me->paging = bulk;
  1386. if (bulk) {
  1387. me->read_buf = (uint8_t*)serd_bufalloc(SERD_PAGE_SIZE);
  1388. memset(me->read_buf, '\0', SERD_PAGE_SIZE);
  1389. SerdStatus st = page(me);
  1390. if (st) {
  1391. serd_reader_end_stream(me);
  1392. return st;
  1393. }
  1394. skip_bom(me);
  1395. } else {
  1396. me->read_buf = &me->read_byte;
  1397. me->read_byte = 0; // Don't read to avoid potentially blocking
  1398. }
  1399. return SERD_SUCCESS;
  1400. }
  1401. SERD_API
  1402. SerdStatus
  1403. serd_reader_read_chunk(SerdReader* me)
  1404. {
  1405. if (!me->read_byte) {
  1406. // Read initial byte
  1407. const int c = fgetc(me->fd);
  1408. me->read_byte = (c == EOF) ? 0 : (uint8_t)c;
  1409. if (c == EOF) {
  1410. me->eof = true;
  1411. return SERD_FAILURE;
  1412. }
  1413. }
  1414. return read_statement(me) ? SERD_SUCCESS : SERD_FAILURE;
  1415. }
  1416. SERD_API
  1417. SerdStatus
  1418. serd_reader_end_stream(SerdReader* me)
  1419. {
  1420. if (me->paging) {
  1421. free(me->read_buf);
  1422. }
  1423. me->fd = 0;
  1424. me->read_buf = NULL;
  1425. return SERD_SUCCESS;
  1426. }
  1427. SERD_API
  1428. SerdStatus
  1429. serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name)
  1430. {
  1431. SerdStatus st = serd_reader_start_stream(me, file, name, true);
  1432. if (!st) {
  1433. st = read_turtleDoc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN;
  1434. serd_reader_end_stream(me);
  1435. }
  1436. return st;
  1437. }
  1438. SERD_API
  1439. SerdStatus
  1440. serd_reader_read_string(SerdReader* me, const uint8_t* utf8)
  1441. {
  1442. const Cursor cur = { (const uint8_t*)"(string)", 1, 1 };
  1443. me->read_buf = (uint8_t*)utf8;
  1444. me->read_head = 0;
  1445. me->cur = cur;
  1446. me->from_file = false;
  1447. me->paging = false;
  1448. me->eof = false;
  1449. skip_bom(me);
  1450. const bool ret = read_turtleDoc(me);
  1451. me->read_buf = NULL;
  1452. return ret ? SERD_SUCCESS : SERD_ERR_UNKNOWN;
  1453. }