SeqAn3  3.1.0-rc.1
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <cstdio>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <stack>
20 #include <string>
21 #include <string_view>
22 #include <type_traits>
23 #include <vector>
24 
44 
45 namespace seqan3
46 {
84 {
85 public:
89  format_vienna() noexcept = default;
90  format_vienna(format_vienna const &) noexcept = default;
91  format_vienna & operator=(format_vienna const &) noexcept = default;
92  format_vienna(format_vienna &&) noexcept = default;
93  format_vienna & operator=(format_vienna &&) noexcept = default;
94  ~format_vienna() noexcept = default;
95 
97 
99  static inline std::vector<std::string> file_extensions
100  {
101  { "dbn" },
102  { "fasta" },
103  { "fa" }
104  };
105 
106 protected:
108  template <typename stream_type, // constraints checked by file
109  typename seq_legal_alph_type,
110  bool structured_seq_combined,
111  typename seq_type, // other constraints checked inside function
112  typename id_type,
113  typename bpp_type,
114  typename structure_type,
115  typename energy_type,
116  typename react_type,
117  typename comment_type,
118  typename offset_type>
119  void read_structure_record(stream_type & stream,
121  seq_type & seq,
122  id_type & id,
123  bpp_type & bpp,
124  structure_type & structure,
125  energy_type & energy,
126  react_type & SEQAN3_DOXYGEN_ONLY(react),
127  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
128  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
129  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
130  {
131  auto stream_view = detail::istreambuf(stream);
132 
133  // READ ID (if present)
134  auto constexpr is_id = is_char<'>'>;
135  if (is_id(*begin(stream_view)))
136  {
137  if constexpr (!detail::decays_to_ignore_v<id_type>)
138  {
139  if (options.truncate_ids)
140  {
141  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
142  | detail::take_until_or_throw(is_cntrl || is_blank)
143  | views::char_to<std::ranges::range_value_t<id_type>>,
144  std::cpp20::back_inserter(id));
145  detail::consume(stream_view | detail::take_line_or_throw);
146  }
147  else
148  {
149  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
150  | detail::take_line_or_throw
151  | views::char_to<std::ranges::range_value_t<id_type>>,
152  std::cpp20::back_inserter(id));
153  }
154  }
155  else
156  {
157  detail::consume(stream_view | detail::take_line_or_throw);
158  }
159  }
160  else if constexpr (!detail::decays_to_ignore_v<id_type>)
161  {
162  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
163  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
164  {
165  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
166  is_id.msg + " and char_is_valid_for<" +
167  detail::type_name_as_string<seq_legal_alph_type> + ">" +
168  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
169  }
170  }
171 
172  // READ SEQUENCE
173  if constexpr (!detail::decays_to_ignore_v<seq_type>)
174  {
175  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
176  std::ranges::copy(stream_view | detail::take_line_or_throw // until end of line
177  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
178  | std::views::transform([is_legal_seq](char const c)
179  {
180  if (!is_legal_seq(c)) // enforce legal alphabet
181  {
182  throw parse_error{std::string{"Encountered an unexpected letter: "} +
183  "char_is_valid_for<" +
184  detail::type_name_as_string<seq_legal_alph_type> +
185  "> evaluated to false on " +
186  detail::make_printable(c)};
187  }
188  return c;
189  })
190  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
191  std::cpp20::back_inserter(seq));
192  }
193  else
194  {
195  detail::consume(stream_view | detail::take_line_or_throw);
196  }
197 
198  // READ STRUCTURE (if present)
199  [[maybe_unused]] int64_t structure_length{};
200  if constexpr (!detail::decays_to_ignore_v<structure_type>)
201  {
202  if constexpr (structured_seq_combined)
203  {
204  assert(std::addressof(seq) == std::addressof(structure));
205  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
206  // We need the structure_length parameter to count the length of the structure while reading
207  // because we cannot infer it from the (already resized) structure_seq object.
208  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
209  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
210 
211  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
212  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
213  }
214  else
215  {
216  using alph_type = std::ranges::range_value_t<structure_type>;
217  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
218  structure_length = std::ranges::distance(structure);
219 
220  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
221  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
222  }
223  }
224  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
225  {
226  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
227  structure_length = std::ranges::distance(bpp);
228  }
229  else
230  {
231  detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
232  }
233 
234  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
235  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
236  {
237  if (std::ranges::distance(seq) != structure_length)
238  throw parse_error{"Found sequence and associated structure of different length."};
239  }
240 
241  // READ ENERGY (if present)
242  if constexpr (!detail::decays_to_ignore_v<energy_type>)
243  {
244  std::string e_str = stream_view | detail::take_line
245  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
246  | views::to<std::string>;
247  if (!e_str.empty())
248  {
249  size_t num_processed;
250  energy = std::stod(e_str, &num_processed);
251  if (num_processed != e_str.size()) // [[unlikely]]
252  {
253  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
254  }
255  }
256  }
257  else
258  {
259  detail::consume(stream_view | detail::take_line);
260  }
261  detail::consume(stream_view | detail::take_until(!is_space));
262  }
263 
265  template <typename stream_type, // constraints checked by file
266  typename seq_type, // other constraints checked inside function
267  typename id_type,
268  typename bpp_type,
269  typename structure_type,
270  typename energy_type,
271  typename react_type,
272  typename comment_type,
273  typename offset_type>
274  void write_structure_record(stream_type & stream,
275  structure_file_output_options const & options,
276  seq_type && seq,
277  id_type && id,
278  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
279  structure_type && structure,
280  energy_type && energy,
281  react_type && SEQAN3_DOXYGEN_ONLY(react),
282  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
283  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
284  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
285  {
286  std::cpp20::ostreambuf_iterator stream_it{stream};
287 
288  // WRITE ID (optional)
289  if constexpr (!detail::decays_to_ignore_v<id_type>)
290  {
291  if (!std::ranges::empty(id))
292  {
293  stream_it = '>';
294  stream_it = ' ';
295  std::ranges::copy(id, stream_it);
296  detail::write_eol(stream_it, options.add_carriage_return);
297  }
298  }
299 
300  // WRITE SEQUENCE
301  if constexpr (!detail::decays_to_ignore_v<seq_type>)
302  {
303  if (std::ranges::empty(seq)) //[[unlikely]]
304  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
305 
306  std::ranges::copy(seq | views::to_char, stream_it);
307  detail::write_eol(stream_it, options.add_carriage_return);
308  }
309  else
310  {
311  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
312  "when writing Vienna files."};
313  }
314 
315  // WRITE STRUCTURE (optional)
316  if constexpr (!detail::decays_to_ignore_v<structure_type>)
317  {
318  if (!std::ranges::empty(structure))
319  std::ranges::copy(structure | views::to_char, stream_it);
320 
321  // WRITE ENERGY (optional)
322  if constexpr (!detail::decays_to_ignore_v<energy_type>)
323  {
324  if (energy)
325  {
326 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
327 // auto [endptr, ec] = std::to_chars(str.data(),
328 // str.data() + str.size(),
329 // energy,
330 // std::chars_format::fixed,
331 // options.precision);
332 // if (ec == std::errc())
333 // std::ranges::copy(str.data(), endptr, stream_it);
334 // else
335 // throw std::runtime_error{"The energy could not be transformed into a string."};
336 
337  stream_it = ' ';
338  stream_it = '(';
339 
340  std::array<char, 100> str;
341  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
342  if (len < 0 || len >= 100)
343  throw std::runtime_error{"The energy could not be transformed into a string."};
344  std::ranges::copy(str.data(), str.data() + len, stream_it);
345 
346  stream_it = ')';
347  }
348  }
349  detail::write_eol(stream_it, options.add_carriage_return);
350  }
351  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
352  {
353  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
354  }
355  }
356 
357 private:
364  template <typename alph_type, typename stream_view_type>
365  auto read_structure(stream_view_type & stream_view)
366  {
367  auto constexpr is_legal_structure = char_is_valid_for<alph_type>;
368  return stream_view | detail::take_until(is_space) // until whitespace
369  | std::views::transform([is_legal_structure](char const c)
370  {
371  if (!is_legal_structure(c))
372  {
373  throw parse_error{
374  std::string{"Encountered an unexpected letter: char_is_valid_for<"} +
375  detail::type_name_as_string<alph_type> +
376  "> evaluated to false on " + detail::make_printable(c)};
377  }
378  return c;
379  }) // enforce legal alphabet
380  | views::char_to<alph_type>; // convert to actual target alphabet
381  }
382 };
383 
384 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:84
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:100
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:119
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
Provides C++20 additions to the <iterator> header.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Definition: affine_cell_proxy.hpp:438
Provides character predicates for tokenisation.
Adaptations of concepts from the Ranges TS.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides C++20 additions to the type_traits header.
Provides the WUSS format for RNA structure.