SeqAn3  3.1.0-rc.1
The Modern C++ library for sequence analysis.
cigar.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <seqan3/std/charconv>
17 #include <seqan3/std/concepts>
18 #include <seqan3/std/ranges>
19 #include <sstream>
20 
29 
30 namespace seqan3::detail
31 {
34 struct view_equality_fn
35 {
37  template <std::ranges::forward_range rng1_type, std::ranges::forward_range rng2_type>
38  constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
39  {
40  return std::ranges::equal(rng1, rng2);
41  }
42 };
43 
83 template <typename reference_char_type, typename query_char_type>
84 [[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
85  query_char_type const query_char,
86  bool const extended_cigar)
88  requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap> &&
89  seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
91 {
92  constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
93  uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
94  if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
95  key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
96 
97  return assign_char_to(operators[key], cigar::operation{});
98 }
99 
107 inline void update_alignment_lengths(int32_t & ref_length,
108  int32_t & seq_length,
109  char const cigar_operation,
110  uint32_t const cigar_count)
111 {
112  switch (cigar_operation)
113  {
114  case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
115  case 'D': case 'N': ref_length += cigar_count; break;
116  case 'I' : seq_length += cigar_count; break;
117  case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
118  default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
119  }
120 }
121 
136 template <typename cigar_input_type>
137 inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
138 {
139  std::vector<cigar> operations{};
140  std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
141  char cigar_operation{};
142  uint32_t cigar_count{};
143  int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
144 
145  // transform input into a single input view if it isn't already
146  auto cigar_view = cigar_input | views::single_pass_input;
147 
148  // parse the rest of the cigar
149  // -------------------------------------------------------------------------------------------------------------
150  while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
151  {
152  auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
153  cigar_operation = *std::ranges::begin(cigar_view);
154  std::ranges::next(std::ranges::begin(cigar_view));
155 
156  if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
157  throw format_error{"Corrupted cigar string encountered"};
158 
159  update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
160  operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
161  }
162 
163  return {operations, ref_length, seq_length};
164 }
165 
203 template <seqan3::detail::pairwise_alignment alignment_type>
204 [[nodiscard]] inline std::vector<cigar> get_cigar_vector(alignment_type && alignment,
205  uint32_t const query_start_pos = 0,
206  uint32_t const query_end_pos = 0,
207  bool const extended_cigar = false)
208 {
209  using std::get;
210 
211  auto & ref_seq = get<0>(alignment);
212  auto & query_seq = get<1>(alignment);
213 
214  if (ref_seq.size() != query_seq.size())
215  throw std::logic_error{"The aligned sequences must have the same length."};
216 
217  std::vector<cigar> result{};
218 
219  if (!ref_seq.size())
220  return result; // return empty string if sequences are empty
221 
222  // Add (S)oft-clipping at the start of the read
223  if (query_start_pos)
224  result.emplace_back(query_start_pos, 'S'_cigar_operation);
225 
226  // Create cigar string from alignment
227  // -------------------------------------------------------------------------
228  // initialize first operation and count value:
229  cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
230  uint32_t count{0};
231 
232  // go through alignment columns
233  for (auto column : views::zip(ref_seq, query_seq))
234  {
235  cigar::operation next_op = map_aligned_values_to_cigar_op(std::get<0>(column),
236  std::get<1>(column),
237  extended_cigar);
238 
239  if (operation == next_op)
240  {
241  ++count;
242  }
243  else
244  {
245  result.emplace_back(count, operation);
246  operation = next_op;
247  count = 1;
248  }
249  }
250 
251  // append last cigar element
252  result.emplace_back(count, operation);
253 
254  // Add (S)oft-clipping at the end of the read
255  if (query_end_pos)
256  result.emplace_back(query_end_pos, 'S'_cigar_operation);
257 
258  return result;
259 }
260 
266 [[nodiscard]] inline std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
267 {
268  std::string result{};
269  std::ranges::for_each(cigar_vector, [&result] (auto & cig) { result.append(cig.to_string()); });
270  return result;
271 }
272 
306 template <seqan3::detail::pairwise_alignment alignment_type>
307 [[nodiscard]] inline std::string get_cigar_string(alignment_type && alignment,
308  uint32_t const query_start_pos = 0,
309  uint32_t const query_end_pos = 0,
310  bool const extended_cigar = false)
311 {
312  return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
313 }
314 
348 template <seqan3::aligned_sequence ref_seq_type, seqan3::aligned_sequence query_seq_type>
349 [[nodiscard]] inline std::string get_cigar_string(ref_seq_type && ref_seq,
350  query_seq_type && query_seq,
351  uint32_t const query_start_pos = 0,
352  uint32_t const query_end_pos = 0,
353  bool const extended_cigar = false)
354 {
355  return get_cigar_string(std::tie(ref_seq, query_seq), query_start_pos, query_end_pos, extended_cigar);
356 }
357 
380 template <seqan3::detail::writable_pairwise_alignment alignment_type>
381 inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar> const & cigar_vector)
382 {
383  using std::get;
384  auto current_ref_pos = std::ranges::begin(get<0>(alignment));
385  auto current_read_pos = std::ranges::begin(get<1>(alignment));
386 
387  for (auto [cigar_count, cigar_operation] : cigar_vector)
388  {
389  // ignore since alignment shall contain sliced sequences
390  if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
391  continue;
392 
393  assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
394  ('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation) ||
395  ('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation) ||
396  ('P'_cigar_operation == cigar_operation)); // checked during IO
397 
398  if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
399  ('X'_cigar_operation == cigar_operation))
400  {
401  std::ranges::advance(current_ref_pos , cigar_count);
402  std::ranges::advance(current_read_pos, cigar_count);
403  }
404  else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
405  {
406  // insert gaps into read
407 
408  assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
409  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
410  ++current_read_pos;
411  std::ranges::advance(current_ref_pos , cigar_count);
412  }
413  else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
414  {
415  assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
416  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
417  ++current_ref_pos;
418  std::ranges::advance(current_read_pos, cigar_count);
419  }
420  else if (('P'_cigar_operation == cigar_operation)) // skip padding
421  {
422  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
423  ++current_ref_pos;
424 
425  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
426  ++current_read_pos;
427  }
428  }
429 }
430 
433 struct access_restrictor_fn
434 {
436  template <typename chr_t>
437  [[noreturn]] chr_t operator()(chr_t) const
438  {
439  throw std::logic_error{"Access is not allowed because there is no sequence information."};
440  }
441 };
442 
443 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Provides the seqan3::cigar alphabet.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
The Concepts library.
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:523
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:269
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:361
constexpr auto zip
A zip view.
Definition: zip.hpp:29
constexpr auto const & get(configuration< configs_t... > const &config) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: configuration.hpp:429
Provides seqan3::detail::pairwise_alignment and seqan3::detail::writable_pairwise_alignment.
Provides character predicates for tokenisation.
Adaptations of concepts from the Ranges TS.
Provides seqan3::views::single_pass_input.
Provides std::from_chars and std::to_chars if not defined in the stdlib <charconv> header.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Auxiliary for pretty printing of exception messages.
Provides seqan3::tuple_like.
Provides seqan3::views::zip.