SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <iterator>
16#include <ranges>
17#include <string>
18#include <vector>
19
38
39namespace seqan3
40{
41
113{
114public:
118 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
119 format_sam() = default;
120 format_sam(format_sam const &) = default;
121 format_sam & operator=(format_sam const &) = default;
122 format_sam(format_sam &&) = default;
124 ~format_sam() = default;
125
127
130 {"sam"},
131 };
132
133protected:
134 template <typename stream_type, // constraints checked by file
135 typename seq_legal_alph_type,
136 typename stream_pos_type,
137 typename seq_type, // other constraints checked inside function
138 typename id_type,
139 typename qual_type>
140 void read_sequence_record(stream_type & stream,
142 stream_pos_type & position_buffer,
143 seq_type & sequence,
144 id_type & id,
145 qual_type & qualities);
146
147 template <typename stream_type, // constraints checked by file
148 typename seq_type, // other constraints checked inside function
149 typename id_type,
150 typename qual_type>
151 void write_sequence_record(stream_type & stream,
152 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
153 seq_type && sequence,
154 id_type && id,
155 qual_type && qualities);
156
157 template <typename stream_type, // constraints checked by file
158 typename seq_legal_alph_type,
159 typename ref_seqs_type,
160 typename ref_ids_type,
161 typename stream_pos_type,
162 typename seq_type,
163 typename id_type,
164 typename offset_type,
165 typename ref_seq_type,
166 typename ref_id_type,
167 typename ref_offset_type,
168 typename cigar_type,
169 typename flag_type,
170 typename mapq_type,
171 typename qual_type,
172 typename mate_type,
173 typename tag_dict_type,
174 typename e_value_type,
175 typename bit_score_type>
176 void read_alignment_record(stream_type & stream,
177 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
178 ref_seqs_type & ref_seqs,
180 stream_pos_type & position_buffer,
181 seq_type & seq,
182 qual_type & qual,
183 id_type & id,
184 offset_type & offset,
185 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
186 ref_id_type & ref_id,
187 ref_offset_type & ref_offset,
188 cigar_type & cigar_vector,
189 flag_type & flag,
190 mapq_type & mapq,
191 mate_type & mate,
192 tag_dict_type & tag_dict,
193 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
194 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
195
196 template <typename stream_type,
197 typename header_type,
198 typename seq_type,
199 typename id_type,
200 typename ref_seq_type,
201 typename ref_id_type,
202 typename qual_type,
203 typename mate_type,
204 typename tag_dict_type,
205 typename e_value_type,
206 typename bit_score_type>
207 void write_alignment_record(stream_type & stream,
208 sam_file_output_options const & options,
209 header_type && header,
210 seq_type && seq,
211 qual_type && qual,
212 id_type && id,
213 int32_t const offset,
214 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
215 ref_id_type && ref_id,
216 std::optional<int32_t> ref_offset,
217 std::vector<cigar> const & cigar_vector,
218 sam_flag const flag,
219 uint8_t const mapq,
220 mate_type && mate,
221 tag_dict_type && tag_dict,
222 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
223 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
224
225private:
228
230 static constexpr std::string_view dummy{};
231
234
237
240 {
241 return dummy;
242 }
243
245 template <typename t>
246 decltype(auto) default_or(t && v) const noexcept
247 {
248 return std::forward<t>(v);
249 }
250
251 template <typename stream_view_type, arithmetic value_type>
252 void
253 read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view, value_type value);
254
255 template <typename stream_view_type>
256 void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view);
257
258 template <typename stream_view_type>
259 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
260
261 template <typename stream_it_t, std::ranges::forward_range field_type>
262 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
263
264 template <typename stream_it_t>
265 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
266
267 template <typename stream_it_t>
268 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
269};
270
272template <typename stream_type, // constraints checked by file
273 typename seq_legal_alph_type,
274 typename stream_pos_type,
275 typename seq_type, // other constraints checked inside function
276 typename id_type,
277 typename qual_type>
278inline void format_sam::read_sequence_record(stream_type & stream,
280 stream_pos_type & position_buffer,
281 seq_type & sequence,
282 id_type & id,
283 qual_type & qualities)
284{
286
287 {
289 align_options,
290 std::ignore,
292 position_buffer,
293 sequence,
294 qualities,
295 id,
296 std::ignore,
297 std::ignore,
298 std::ignore,
299 std::ignore,
300 std::ignore,
301 std::ignore,
302 std::ignore,
303 std::ignore,
304 std::ignore,
305 std::ignore,
306 std::ignore);
307 }
308
309 if constexpr (!detail::decays_to_ignore_v<seq_type>)
310 if (std::ranges::distance(sequence) == 0)
311 throw parse_error{"The sequence information must not be empty."};
312 if constexpr (!detail::decays_to_ignore_v<id_type>)
313 if (std::ranges::distance(id) == 0)
314 throw parse_error{"The id information must not be empty."};
315
316 if (options.truncate_ids)
317 id = id | detail::take_until_and_consume(is_space) | ranges::to<id_type>();
318}
319
321template <typename stream_type, // constraints checked by file
322 typename seq_type, // other constraints checked inside function
323 typename id_type,
324 typename qual_type>
325inline void format_sam::write_sequence_record(stream_type & stream,
326 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
327 seq_type && sequence,
328 id_type && id,
329 qual_type && qualities)
330{
331 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
332
333 sam_file_output_options output_options;
334
336 output_options,
337 /*header*/ std::ignore,
338 /*seq*/ default_or(sequence),
339 /*qual*/ default_or(qualities),
340 /*id*/ default_or(id),
341 /*offset*/ 0,
342 /*ref_seq*/ std::string_view{},
343 /*ref_id*/ std::string_view{},
344 /*ref_offset*/ -1,
345 /*cigar_vector*/ std::vector<cigar>{},
346 /*flag*/ sam_flag::none,
347 /*mapq*/ 0,
348 /*mate*/ default_mate_t{},
349 /*tag_dict*/ sam_tag_dictionary{},
350 /*e_value*/ 0,
351 /*bit_score*/ 0);
352}
353
355template <typename stream_type, // constraints checked by file
356 typename seq_legal_alph_type,
357 typename ref_seqs_type,
358 typename ref_ids_type,
359 typename stream_pos_type,
360 typename seq_type,
361 typename id_type,
362 typename offset_type,
363 typename ref_seq_type,
364 typename ref_id_type,
365 typename ref_offset_type,
366 typename cigar_type,
367 typename flag_type,
368 typename mapq_type,
369 typename qual_type,
370 typename mate_type,
371 typename tag_dict_type,
372 typename e_value_type,
373 typename bit_score_type>
374inline void
376 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
377 ref_seqs_type & ref_seqs,
379 stream_pos_type & position_buffer,
380 seq_type & seq,
381 qual_type & qual,
382 id_type & id,
383 offset_type & offset,
384 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
385 ref_id_type & ref_id,
386 ref_offset_type & ref_offset,
387 cigar_type & cigar_vector,
388 flag_type & flag,
389 mapq_type & mapq,
390 mate_type & mate,
391 tag_dict_type & tag_dict,
392 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
393 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
394{
395 static_assert(detail::decays_to_ignore_v<ref_offset_type>
396 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
397 "The ref_offset must be a specialisation of std::optional.");
398
399 auto stream_view = detail::istreambuf(stream);
400 auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
401
402 int32_t ref_offset_tmp{}; // needed to read the ref_offset (int) beofre storing it in std::optional<uint32_t>
403 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{}; // in case mate is requested but ref_offset not
404
405 // Header
406 // -------------------------------------------------------------------------------------------------------------
407 if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
408 {
409 read_header(stream_view, header, ref_seqs);
410
411 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
412 return;
413 }
414
415 // Store the current file position in the buffer.
416 position_buffer = stream.tellg();
417
418 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
419 // -------------------------------------------------------------------------------------------------------------
420 if constexpr (!detail::decays_to_ignore_v<id_type>)
421 read_forward_range_field(field_view, id);
422 else
423 detail::consume(field_view);
424
425 uint16_t flag_integral{};
426 read_arithmetic_field(field_view, flag_integral);
427 flag = sam_flag{flag_integral};
428
429 read_forward_range_field(field_view, ref_id_tmp);
430 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
431
432 read_arithmetic_field(field_view, ref_offset_tmp);
433 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
434
435 if (ref_offset_tmp == -1)
436 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
437 else if (ref_offset_tmp > -1)
438 ref_offset = ref_offset_tmp;
439 else if (ref_offset_tmp < -1)
440 throw format_error{"No negative values are allowed for field::ref_offset."};
441
442 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
443 read_arithmetic_field(field_view, mapq);
444 else
445 detail::consume(field_view);
446
447 // Field 6: CIGAR
448 // -------------------------------------------------------------------------------------------------------------
449 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
450 {
451 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
452 {
453 int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
454 std::tie(cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
455 int32_t soft_clipping_end{};
456 int32_t offset_tmp{};
457 transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
458 offset = offset_tmp;
459 }
460 else
461 {
462 ++std::ranges::begin(field_view); // skip '*'
463 }
464 }
465 else
466 {
467 detail::consume(field_view);
468 }
469
470 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
471 // -------------------------------------------------------------------------------------------------------------
472 if constexpr (!detail::decays_to_ignore_v<mate_type>)
473 {
474 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
475 read_forward_range_field(field_view, tmp_mate_ref_id); // RNEXT
476
477 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
478 {
479 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
480 get<0>(mate) = ref_id;
481 else
482 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
483 }
484 else
485 {
486 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
487 }
488
489 int32_t tmp_pnext{};
490 read_arithmetic_field(field_view, tmp_pnext); // PNEXT
491
492 if (tmp_pnext > 0)
493 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
494 else if (tmp_pnext < 0)
495 throw format_error{"No negative values are allowed at the mate mapping position."};
496 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
497
498 read_arithmetic_field(field_view, get<2>(mate)); // TLEN
499 }
500 else
501 {
502 for (size_t i = 0; i < 3u; ++i)
503 {
504 detail::consume(field_view);
505 }
506 }
507
508 // Field 10: Sequence
509 // -------------------------------------------------------------------------------------------------------------
510 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
511 {
512 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
513 auto seq_stream =
514 field_view
516 [is_legal_alph](char const c) // enforce legal alphabet
517 {
518 if (!is_legal_alph(c))
519 throw parse_error{std::string{"Encountered an unexpected letter: "} + "char_is_valid_for<"
520 + detail::type_name_as_string<seq_legal_alph_type>
521 + "> evaluated to false on " + detail::make_printable(c)};
522 return c;
523 });
524
525 if constexpr (detail::decays_to_ignore_v<seq_type>)
526 {
527 detail::consume(seq_stream);
528 }
529 else
530 {
531 read_forward_range_field(seq_stream, seq);
532 }
533 }
534 else
535 {
536 ++std::ranges::begin(field_view); // skip '*'
537 }
538
539 // Field 11: Quality
540 // -------------------------------------------------------------------------------------------------------------
541 auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
542 auto qual_view = stream_view | detail::take_until_or_throw(tab_or_end);
543 if constexpr (!detail::decays_to_ignore_v<qual_type>)
544 read_forward_range_field(qual_view, qual);
545 else
546 detail::consume(qual_view);
547
548 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
549 {
550 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0
551 && std::ranges::distance(seq) != std::ranges::distance(qual))
552 {
553 throw format_error{detail::to_string("Sequence length (",
554 std::ranges::distance(seq),
555 ") and quality length (",
556 std::ranges::distance(qual),
557 ") must be the same.")};
558 }
559 }
560
561 // All remaining optional fields if any: SAM tags dictionary
562 // -------------------------------------------------------------------------------------------------------------
563 while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
564 {
565 ++std::ranges::begin(stream_view); // skip tab
566 auto stream_until_tab_or_end = stream_view | detail::take_until_or_throw(tab_or_end);
567 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
568 read_sam_dict_field(stream_until_tab_or_end, tag_dict);
569 else
570 detail::consume(stream_until_tab_or_end);
571 }
572
573 detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
574}
575
577template <typename stream_type,
578 typename header_type,
579 typename seq_type,
580 typename id_type,
581 typename ref_seq_type,
582 typename ref_id_type,
583 typename qual_type,
584 typename mate_type,
585 typename tag_dict_type,
586 typename e_value_type,
587 typename bit_score_type>
588inline void format_sam::write_alignment_record(stream_type & stream,
589 sam_file_output_options const & options,
590 header_type && header,
591 seq_type && seq,
592 qual_type && qual,
593 id_type && id,
594 int32_t const offset,
595 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
596 ref_id_type && ref_id,
597 std::optional<int32_t> ref_offset,
598 std::vector<cigar> const & cigar_vector,
599 sam_flag const flag,
600 uint8_t const mapq,
601 mate_type && mate,
602 tag_dict_type && tag_dict,
603 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
604 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
605{
606 /* Note the following general things:
607 *
608 * - Given the SAM specifications, all fields may be empty
609 *
610 * - arithmetic values default to 0 while all others default to '*'
611 *
612 * - Because of the former, arithmetic values can be directly streamed
613 * into 'stream' as operator<< is defined for all arithmetic types
614 * and the default value (0) is also the SAM default.
615 *
616 * - All other non-arithmetic values need to be checked for emptiness
617 */
618
619 // ---------------------------------------------------------------------
620 // Type Requirements (as static asserts for user friendliness)
621 // ---------------------------------------------------------------------
622 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
623 "The seq object must be a std::ranges::forward_range over "
624 "letters that model seqan3::alphabet.");
625
626 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
627 "The id object must be a std::ranges::forward_range over "
628 "letters that model seqan3::alphabet.");
629
630 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
631 {
632 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
633 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
634 "The ref_id object must be a std::ranges::forward_range "
635 "over letters that model seqan3::alphabet.");
636
637 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
638 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
639 static_assert(!detail::decays_to_ignore_v<header_type>,
640 "If you give indices as reference id information the header must also be present.");
641 }
642
643 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
644 "The qual object must be a std::ranges::forward_range "
645 "over letters that model seqan3::alphabet.");
646
648 "The mate object must be a std::tuple of size 3 with "
649 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
650 "2) a std::integral or std::optional<std::integral>, and "
651 "3) a std::integral.");
652
653 static_assert(
654 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
655 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
656 || detail::is_type_specialisation_of_v<
657 std::remove_cvref_t<decltype(std::get<0>(mate))>,
658 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
659 || detail::is_type_specialisation_of_v<
660 std::remove_cvref_t<decltype(std::get<1>(mate))>,
661 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
662 "The mate object must be a std::tuple of size 3 with "
663 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
664 "2) a std::integral or std::optional<std::integral>, and "
665 "3) a std::integral.");
666
667 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
668 || detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>,
670 static_assert(!detail::decays_to_ignore_v<header_type>,
671 "If you give indices as mate reference id information the header must also be present.");
672
673 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
674 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
675
676 // ---------------------------------------------------------------------
677 // logical Requirements
678 // ---------------------------------------------------------------------
679 if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
680 && !std::integral<std::remove_reference_t<ref_id_type>>
681 && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
682 {
683
684 if (options.sam_require_header && !std::ranges::empty(ref_id))
685 {
686 auto id_it = header.ref_dict.end();
687
688 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> && std::ranges::sized_range<decltype(ref_id)>
689 && std::ranges::borrowed_range<decltype(ref_id)>)
690 {
691 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
692 }
693 else
694 {
695 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
696
698 "The ref_id type is not convertible to the reference id information stored in the "
699 "reference dictionary of the header object.");
700
701 id_it = header.ref_dict.find(ref_id);
702 }
703
704 if (id_it == header.ref_dict.end()) // no reference id matched
705 throw format_error{detail::to_string("The ref_id '",
706 ref_id,
707 "' was not in the list of references:",
708 header.ref_ids())};
709 }
710 }
711
712 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
713 throw format_error{"The ref_offset object must be a std::integral >= 0."};
714
715 // ---------------------------------------------------------------------
716 // Writing the Header on first call
717 // ---------------------------------------------------------------------
718 if constexpr (!detail::decays_to_ignore_v<header_type>)
719 {
721 {
722 write_header(stream, options, header);
723 header_was_written = true;
724 }
725 }
726
727 // ---------------------------------------------------------------------
728 // Writing the Record
729 // ---------------------------------------------------------------------
730
731 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
732 constexpr char separator{'\t'};
733
734 write_range_or_asterisk(stream_it, id);
735 *stream_it = separator;
736
737 stream_it.write_number(static_cast<uint16_t>(flag));
738 *stream_it = separator;
739
740 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
741 {
742 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
743 {
744 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
745 }
746 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
747 {
748 if (ref_id.has_value())
749 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
750 else
751 *stream_it = '*';
752 }
753 else
754 {
756 }
757 }
758 else
759 {
760 *stream_it = '*';
761 }
762
763 *stream_it = separator;
764
765 // SAM is 1 based, 0 indicates unmapped read if optional is not set
766 stream_it.write_number(ref_offset.value_or(-1) + 1);
767 *stream_it = separator;
768
769 stream_it.write_number(static_cast<unsigned>(mapq));
770 *stream_it = separator;
771
772 if (!std::ranges::empty(cigar_vector))
773 {
774 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
775 stream_it.write_range(c.to_string());
776 }
777 else
778 {
779 *stream_it = '*';
780 }
781
782 *stream_it = separator;
783
784 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
785 {
786 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
787 }
788 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>,
790 {
791 if (get<0>(mate).has_value())
792 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value()]);
793 else
794 *stream_it = '*';
795 }
796 else
797 {
798 write_range_or_asterisk(stream_it, get<0>(mate));
799 }
800
801 *stream_it = separator;
802
803 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
804 {
805 // SAM is 1 based, 0 indicates unmapped read if optional is not set
806 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
807 *stream_it = separator;
808 }
809 else
810 {
811 stream_it.write_number(get<1>(mate));
812 *stream_it = separator;
813 }
814
815 stream_it.write_number(get<2>(mate));
816 *stream_it = separator;
817
818 write_range_or_asterisk(stream_it, seq);
819 *stream_it = separator;
820
821 write_range_or_asterisk(stream_it, qual);
822
823 write_tag_fields(stream_it, tag_dict, separator);
824
825 stream_it.write_end_of_line(options.add_carriage_return);
826}
827
845template <typename stream_view_type, arithmetic value_type>
847 stream_view_type && stream_view,
848 value_type value)
849{
850 std::vector<value_type> tmp_vector;
851 while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
852 {
853 read_arithmetic_field(stream_view | detail::take_until(is_char<','>), value);
854 tmp_vector.push_back(value);
855
856 if (is_char<','>(*std::ranges::begin(stream_view)))
857 ++std::ranges::begin(stream_view); // skip ','
858 }
859 variant = std::move(tmp_vector);
860}
861
875template <typename stream_view_type>
876inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view)
877{
878 std::vector<std::byte> tmp_vector;
879 std::byte value;
880
881 while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
882 {
883 try
884 {
885 read_byte_field(stream_view | detail::take_exactly_or_throw(2), value);
886 }
887 catch (std::exception const & e)
888 {
889 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
890 }
891
892 tmp_vector.push_back(value);
893 }
894
895 variant = std::move(tmp_vector);
896}
897
915template <typename stream_view_type>
916inline void format_sam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
917{
918 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
919 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
920 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
921 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
922 */
923 uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
924 ++std::ranges::begin(stream_view); // skip char read before
925 tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
926 ++std::ranges::begin(stream_view); // skip char read before
927 ++std::ranges::begin(stream_view); // skip ':'
928 char type_id = *std::ranges::begin(stream_view);
929 ++std::ranges::begin(stream_view); // skip char read before
930 ++std::ranges::begin(stream_view); // skip ':'
931
932 switch (type_id)
933 {
934 case 'A': // char
935 {
936 target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
937 ++std::ranges::begin(stream_view); // skip char that has been read
938 break;
939 }
940 case 'i': // int32_t
941 {
942 int32_t tmp;
943 read_arithmetic_field(stream_view, tmp);
944 target[tag] = tmp;
945 break;
946 }
947 case 'f': // float
948 {
949 float tmp;
950 read_arithmetic_field(stream_view, tmp);
951 target[tag] = tmp;
952 break;
953 }
954 case 'Z': // string
955 {
956 target[tag] = stream_view | ranges::to<std::string>();
957 break;
958 }
959 case 'H':
960 {
961 read_sam_byte_vector(target[tag], stream_view);
962 break;
963 }
964 case 'B': // Array. Value type depends on second char [cCsSiIf]
965 {
966 char array_value_type_id = *std::ranges::begin(stream_view);
967 ++std::ranges::begin(stream_view); // skip char read before
968 ++std::ranges::begin(stream_view); // skip first ','
969
970 switch (array_value_type_id)
971 {
972 case 'c': // int8_t
973 read_sam_dict_vector(target[tag], stream_view, int8_t{});
974 break;
975 case 'C': // uint8_t
976 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
977 break;
978 case 's': // int16_t
979 read_sam_dict_vector(target[tag], stream_view, int16_t{});
980 break;
981 case 'S': // uint16_t
982 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
983 break;
984 case 'i': // int32_t
985 read_sam_dict_vector(target[tag], stream_view, int32_t{});
986 break;
987 case 'I': // uint32_t
988 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
989 break;
990 case 'f': // float
991 read_sam_dict_vector(target[tag], stream_view, float{});
992 break;
993 default:
994 throw format_error{std::string("The first character in the numerical ")
995 + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
996 + "' was given."};
997 }
998 break;
999 }
1000 default:
1001 throw format_error{std::string("The second character in the numerical id of a "
1002 "SAM tag must be one of [A,i,Z,H,B,f] but '")
1003 + type_id + "' was given."};
1004 }
1005}
1006
1014template <typename stream_it_t, std::ranges::forward_range field_type>
1015inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1016{
1017 if (std::ranges::empty(field_value))
1018 {
1019 *stream_it = '*';
1020 }
1021 else
1022 {
1023 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1024 stream_it.write_range(field_value);
1025 else // convert from alphabets to their character representation
1026 stream_it.write_range(field_value | views::to_char);
1027 }
1028}
1029
1036template <typename stream_it_t>
1037inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1038{
1039 write_range_or_asterisk(stream_it, std::string_view{field_value});
1040}
1041
1049template <typename stream_it_t>
1050inline void
1051format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1052{
1053 auto const stream_variant_fn = [&stream_it](auto && arg) // helper to print a std::variant
1054 {
1055 using T = std::remove_cvref_t<decltype(arg)>;
1056
1057 if constexpr (std::ranges::input_range<T>)
1058 {
1059 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1060 {
1061 stream_it.write_range(arg);
1062 }
1063 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1064 {
1065 if (!std::ranges::empty(arg))
1066 {
1067 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1068
1069 for (auto && elem : arg | std::views::drop(1))
1070 {
1071 *stream_it = ',';
1072 stream_it.write_number(std::to_integer<uint8_t>(elem));
1073 }
1074 }
1075 }
1076 else
1077 {
1078 if (!std::ranges::empty(arg))
1079 {
1080 stream_it.write_number(*std::ranges::begin(arg));
1081
1082 for (auto && elem : arg | std::views::drop(1))
1083 {
1084 *stream_it = ',';
1085 stream_it.write_number(elem);
1086 }
1087 }
1088 }
1089 }
1090 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1091 {
1092 *stream_it = arg;
1093 }
1094 else // number
1095 {
1096 stream_it.write_number(arg);
1097 }
1098 };
1099
1100 for (auto & [tag, variant] : tag_dict)
1101 {
1102 *stream_it = separator;
1103
1104 char const char0 = tag / 256;
1105 char const char1 = tag % 256;
1106
1107 *stream_it = char0;
1108 *stream_it = char1;
1109 *stream_it = ':';
1110 *stream_it = detail::sam_tag_type_char[variant.index()];
1111 *stream_it = ':';
1112
1113 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1114 {
1115 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1116 *stream_it = ',';
1117 }
1118
1119 std::visit(stream_variant_fn, variant);
1120 }
1121}
1122
1123} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The alignment base format.
Definition: format_sam_base.hpp:45
void check_and_assign_ref_id(ref_id_type &ref_id, ref_id_tmp_type &ref_id_tmp, header_type &header, ref_seqs_type &)
Checks for known reference ids or adds a new reference is and assigns a reference id to ref_id.
Definition: format_sam_base.hpp:109
void read_arithmetic_field(stream_view_t &&stream_view, arithmetic_target_type &arithmetic_target)
Reads arithmetic fields using std::from_chars.
Definition: format_sam_base.hpp:264
void write_header(stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:636
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:151
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition: format_sam_base.hpp:232
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:66
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:300
void read_byte_field(stream_view_t &&stream_view, std::byte &byte_target)
Reads std::byte fields using std::from_chars.
Definition: format_sam_base.hpp:204
The SAM format (tag).
Definition: format_sam.hpp:113
sam_file_header default_header
The default header for the alignment format.
Definition: format_sam.hpp:233
std::string_view const & default_or(detail::ignore_t) const noexcept
brief Returns a reference to dummy if passed a std::ignore.
Definition: format_sam.hpp:239
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:588
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=default
Defaulted.
void read_sam_byte_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view)
Reads a list of byte pairs as it is the case for SAM tag byte arrays.
Definition: format_sam.hpp:876
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:129
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:325
format_sam(format_sam &&)=default
Defaulted.
void write_tag_fields(stream_it_t &stream, sam_tag_dictionary const &tag_dict, char const separator)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_sam.hpp:1051
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_sam.hpp:846
void read_sam_dict_field(stream_view_type &&stream_view, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition: format_sam.hpp:916
static constexpr std::string_view dummy
An empty dummy container to pass to align_format.write() such that an empty field is written.
Definition: format_sam.hpp:230
void write_range_or_asterisk(stream_it_t &stream_it, field_type &&field_value)
Writes a field value to the stream.
Definition: format_sam.hpp:1015
bool ref_info_present_in_header
Tracks whether reference information (@SR tag) were found in the SAM header.
Definition: format_sam.hpp:236
decltype(auto) default_or(t &&v) const noexcept
brief Returns the input unchanged.
Definition: format_sam.hpp:246
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:278
std::string tmp_qual
Stores quality values temporarily if seq and qual information are combined (not supported by SAM yet)...
Definition: format_sam.hpp:227
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:375
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:148
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
@ none
None of the flags below are set.
constexpr auto take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until_view.hpp:560
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto take_until_or_throw_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until_view.hpp:602
constexpr auto take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until_view.hpp:588
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until_view.hpp:574
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
std::string make_printable(char const c)
Returns a printable value for the given character c.
Definition: pretty_print.hpp:48
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:125
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: type_list/traits.hpp:395
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
Provides the seqan3::sam_file_header class.
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: io/exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: sam_file/output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: sam_file/output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: sequence_file/input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/output_options.hpp:26
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::ranges::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)