From 167c5e8188beb5dae002ac7571457e3c26eb6a3f Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Mon, 27 Aug 2018 14:53:25 -0400 Subject: [PATCH] Googletest export Fix Theta(N^2) memory usage of EXPECT_EQ(string) when the strings don't match. The underlying CalculateOptimalEdits() implementation used a simple dynamic-programming approach that always used N^2 memory and time. This meant that tests for equality of large strings were ticking time bombs: They'd work fine as long as the test passed, but as soon as the strings differed the test would OOM, which is very hard to debug. I switched it out for a Dijkstra search, which is still worst-case O(N^2), but in the usual case of mostly-matching strings, it is much closer to linear. PiperOrigin-RevId: 210405025 --- .../include/gtest/internal/gtest-internal.h | 14 +- googletest/src/gtest-all.cc | 3 +- googletest/src/gtest-edit-distance.cc | 507 ++++++++++++++++++ googletest/src/gtest.cc | 244 +-------- googletest/test/gtest_unittest.cc | 77 ++- 5 files changed, 580 insertions(+), 265 deletions(-) create mode 100644 googletest/src/gtest-edit-distance.cc diff --git a/googletest/include/gtest/internal/gtest-internal.h b/googletest/include/gtest/internal/gtest-internal.h index 9593a45a..1c2824dd 100644 --- a/googletest/include/gtest/internal/gtest-internal.h +++ b/googletest/include/gtest/internal/gtest-internal.h @@ -159,15 +159,15 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4275 #endif // GTEST_HAS_EXCEPTIONS -namespace edit_distance { // Returns the optimal edits to go from 'left' to 'right'. // All edits cost the same, with replace having lower priority than -// add/remove. -// Simple implementation of the Wagner-Fischer algorithm. -// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm -enum EditType { kMatch, kAdd, kRemove, kReplace }; +// add/remove. Returns an approximation of the maximum memory used in +// 'memory_usage' if non-null. +// Uses a Dijkstra search, with a couple of simple bells and whistles added on. +enum EditType { kEditMatch, kEditAdd, kEditRemove, kEditReplace }; GTEST_API_ std::vector CalculateOptimalEdits( - const std::vector& left, const std::vector& right); + const std::vector& left, const std::vector& right, + size_t* memory_usage = NULL); // Same as above, but the input is represented as strings. GTEST_API_ std::vector CalculateOptimalEdits( @@ -179,8 +179,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, const std::vector& right, size_t context = 2); -} // namespace edit_distance - // Calculate the diff between 'left' and 'right' and return it in unified diff // format. // If not null, stores in 'total_line_count' the total number of lines found diff --git a/googletest/src/gtest-all.cc b/googletest/src/gtest-all.cc index b217a180..3363d398 100644 --- a/googletest/src/gtest-all.cc +++ b/googletest/src/gtest-all.cc @@ -38,10 +38,11 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -#include "src/gtest.cc" #include "src/gtest-death-test.cc" +#include "src/gtest-edit-distance.cc" #include "src/gtest-filepath.cc" #include "src/gtest-port.cc" #include "src/gtest-printers.cc" #include "src/gtest-test-part.cc" #include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/googletest/src/gtest-edit-distance.cc b/googletest/src/gtest-edit-distance.cc new file mode 100644 index 00000000..331a3b9b --- /dev/null +++ b/googletest/src/gtest-edit-distance.cc @@ -0,0 +1,507 @@ +// Copyright 2018, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Internal helper functions for finding optimal edit transformations +// between strings. + +#include "gtest/gtest.h" + +#include +#include +#include // NOLINT +#include +#include + +namespace testing { +namespace internal { +namespace { + +// The following implement data structures and code for a Dijkstra-search +// based implementation of optimal edit distance. + +// Posible states a node can be in. Either a node is unsettled (it hasn't been +// drawn from the priority queue yet), or it is settled and a back-link to its +// parent node is fixed. +enum EditSearchState { + kUnsettled, + kMatchParent, + kAddParent, + kRemoveParent, + kReplaceParent +}; + +// Custom container for search states. This is smaller and faster than a hash +// map, because the used states are dense along diagonals. +// Specifically, each state requires only 1 byte, whereas a hash_map would +// require storing the key, which would come to at least 8 bytes. std::map has +// an extra 32 bytes per node (3 pointers + 1 byte, padded), so even though +// there are circumstances where this class can have kBlockSize overhead per +// state, on average it does better than 40 bytes of overhead per state. +// In addition, in unopt builds (the usual way tests are run) the fewer +// allocations + better locality has this method running 10-50x faster than +// std::map for inputs that are large enough to measure. +class EditSearchMap { + public: + EditSearchMap(size_t left_size, size_t right_size) + : left_size_(left_size), right_size_(right_size) { + GTEST_CHECK_(left_size_ == left_size && right_size_ == right_size) + << "Overflow in size: Arguments too large"; + } + + // Gets a mutable reference to a state - this is actually of type + // EditSearchState - inserting if it does not exist. + unsigned char& insert(UInt32 left, UInt32 right) { + std::vector* vec; + size_t index1; + size_t index2; + if (left > right) { + vec = &left_nodes_; + index1 = left - right - 1; + index2 = right; + } else { + vec = &right_nodes_; + index1 = right - left; + index2 = left; + } + if (vec->size() <= index1) { + GTEST_CHECK_(vec->size() == index1) + << "Array diagonals should only grow by one " << vec->size() << " vs " + << index1; + vec->push_back(block_indices_.size()); + // Round up + block_indices_.resize( + block_indices_.size() + + (DiagonalLength(left, right) + kBlockSize - 1) / kBlockSize, + kUnallocatedBlock); + } + const size_t bucket = index2 / kBlockSize; + const size_t pos_in_bucket = index2 % kBlockSize; + UInt32& level2 = block_indices_[(*vec)[index1] + bucket]; + if (level2 == kUnallocatedBlock) { + level2 = nodes_.size(); + size_t diagonal_length = DiagonalLength(left, right); + GTEST_CHECK_(diagonal_length > index2) + << diagonal_length << " " << index2; + size_t block_size = kBlockSize; + if (diagonal_length / kBlockSize == bucket) { + // We can never get here if diagonal_length is a multiple of + // kBlockSize, which is what we want, since this would evaluate to 0. + block_size = diagonal_length % kBlockSize; + } + nodes_.resize(nodes_.size() + block_size); + } + return nodes_[level2 + pos_in_bucket]; + } + + size_t MemoryUsage() const { + return nodes_.capacity() + + sizeof(UInt32) * (left_nodes_.capacity() + right_nodes_.capacity() + + block_indices_.capacity()); + } + + private: + enum { kBlockSize = 1024, kUnallocatedBlock = 0xFFFFFFFFul }; + + size_t DiagonalLength(UInt32 left, UInt32 right) const { + return std::min(left_size_ - left, right_size_ - right) + + (left < right ? left : right); + } + + // The state space is conceptually a left_size_ by right_size_ sparse matrix + // of EditSearchStates. However, due to the access pattern of the search, it + // is much better to store the nodes per diagonal rather than per row. + UInt32 left_size_; + UInt32 right_size_; + // The nodes are stored by diagonals, split in two: Those to the left of the + // main diagonal are in left_nodes_, and everything else is in right_nodes_. + // The values are indices into block_indices_. + std::vector left_nodes_; + std::vector right_nodes_; + // Every entry here is an offset into the beginning of a kBlockSize-sized + // block in nodes_. An entire diagonal is allocated together here; for a + // diagonal of length <= kBlockSize, that's just a single entry, but for + // longer diagonals multiple contiguous index entries will be reserved at + // once. Unused entries will be assigned kUnallocatedBlock; this + // double-indirect scheme is used to save memory in the cases when an entire + // diagonal isn't needed. + std::vector block_indices_; + // This stores the actual EditSearchState data, pointed to by block_indices_. + std::vector nodes_; +}; + +struct EditHeapEntry { + EditHeapEntry(UInt32 l, UInt32 r, UInt64 c, EditSearchState s) + : left(l), right(r), cost(c), state(s) {} + + UInt32 left; + UInt32 right; + UInt64 cost : 61; + // The state that the node will get when this entry is settled. Therefore, + // this can never be kUnsettled. + UInt64 state : 3; + + bool operator>(const EditHeapEntry& other) const { return cost > other.cost; } +}; + +// Need a min-queue, so invert the comparator. +typedef std::priority_queue, + std::greater> + EditHeap; + +} // namespace + +std::vector CalculateOptimalEdits(const std::vector& left, + const std::vector& right, + size_t* memory_usage) { + const UInt64 kBaseCost = 100000; + // We make replace a little more expensive than add/remove to lower + // their priority. + const UInt64 kReplaceCost = 100001; + // In the common case where the vectors are the same (or almost the same) + // size, we know that an add will have to be followed by some later remove + // (or vice versa) in order to get the lengths to balance. We "borrow" some + // of the cost of the later operation and bring it forward into the earlier + // operation, to increase the cost of exploring (usually fruitlessly) around + // the beginning of the graph. + // However, there is a trade-off: This cheapens the cost of exploring around + // the beginning of the graph (in one direction) when the vectors are + // unequal in length. So we don't steal *all* the cost. + // You can view this as a form of A*, using an admissable heuristic that has + // been re-cast as a cost function that can be used in Dijkstra. + const UInt64 kTowardsGoalCost = 50003; + const UInt64 kAwayFromGoalCost = 2 * kBaseCost - kTowardsGoalCost; + + EditSearchMap node_map(left.size() + 1, right.size() + 1); + EditHeap heap; + heap.push(EditHeapEntry(0, 0, 0, kReplaceParent)); + + while (!heap.empty()) { + const EditHeapEntry current_entry = heap.top(); + heap.pop(); + + UInt32 left_pos = current_entry.left; + UInt32 right_pos = current_entry.right; + unsigned char& current_state = node_map.insert(left_pos, right_pos); + if (current_state != kUnsettled) { + // Node was already settled by a previous entry in the priority queue, + // this is a suboptimal path that should be ignored. + continue; + } + current_state = current_entry.state; + + if (left_pos == left.size() && right_pos == right.size()) { + // This is the normal exit point; if we terminate due to the heap being + // empty, we'll fail a check later. + break; + } + + // Special case: Since the cost of a match is zero, we can immediately + // settle the new node without putting it in the queue, since nothing can + // have a smaller cost than it. Furthermore, we don't need to relax the + // other two edges, since we know we don't need them: Any path from this + // node that would use them has an path via the match that is at least as + // cheap. Together, this means we can loop here until we stop matching. + while (left_pos < left.size() && right_pos < right.size() && + left[left_pos] == right[right_pos]) { + left_pos++; + right_pos++; + unsigned char& fast_forward_state = node_map.insert(left_pos, right_pos); + if (fast_forward_state != kUnsettled) { + // The search reached around and settled this node before settling the + // base node. This means we're completely done with this iteration; + // abort to the outer loop. + goto outer_loop_bottom; + // Otherwise, when can settle this node, even if it was created from + // another state - we know the cost of settling it now is optimal. + } + fast_forward_state = kMatchParent; + } + + // Relax adjacent nodes. We have no way to find or lower the cost of + // existing entries in the heap, so we just push new entries and throw + // them out at the top if the node is already settled. We *could* check to + // see if they're already settled before pushing, but it turns out to be + // ~not any faster, and more complicated to do so. + // + // If we're at an edge, there's only one node to relax. + if (left_pos >= left.size()) { + if (right_pos >= right.size()) { + break; // Can happen due to the fast-path loop above. + } + heap.push(EditHeapEntry(left_pos, right_pos + 1, + current_entry.cost + kTowardsGoalCost, + kAddParent)); + continue; + } + if (right_pos >= right.size()) { + heap.push(EditHeapEntry(left_pos + 1, right_pos, + current_entry.cost + kTowardsGoalCost, + kRemoveParent)); + continue; + } + // General case: Relax 3 edges. + heap.push(EditHeapEntry( + left_pos, right_pos + 1, + current_entry.cost + (right.size() + left_pos > right_pos + left.size() + ? kTowardsGoalCost + : kAwayFromGoalCost), + kAddParent)); + heap.push(EditHeapEntry( + left_pos + 1, right_pos, + current_entry.cost + (right.size() + left_pos < right_pos + left.size() + ? kTowardsGoalCost + : kAwayFromGoalCost), + kRemoveParent)); + heap.push(EditHeapEntry(left_pos + 1, right_pos + 1, + current_entry.cost + kReplaceCost, kReplaceParent)); + outer_loop_bottom : {} // Need the curlies to form a statement. + } + + // Reconstruct the best path. We do it in reverse order. + std::vector best_path; + UInt32 left_pos = left.size(); + UInt32 right_pos = right.size(); + while (left_pos != 0 || right_pos != 0) { + GTEST_CHECK_(left_pos <= left.size() && right_pos <= right.size()); + // The node must already exist, but if it somehow doesn't, it will be + // added as kUnsettled, which will crash below. + const unsigned char state = node_map.insert(left_pos, right_pos); + switch (state) { + case kAddParent: + right_pos--; + break; + case kRemoveParent: + left_pos--; + break; + case kMatchParent: + case kReplaceParent: + left_pos--; + right_pos--; + break; + default: + GTEST_LOG_(FATAL) << "Unsettled node at " << left_pos << "," + << right_pos; + } + best_path.push_back(static_cast(state - 1)); + } + std::reverse(best_path.begin(), best_path.end()); + if (memory_usage != NULL) { + *memory_usage = node_map.MemoryUsage(); + } + return best_path; +} + +namespace { + +// Helper class to convert string into ids with deduplication. +class InternalStrings { + public: + size_t GetId(const std::string* str) { + IdMap::iterator it = ids_.find(str); + if (it != ids_.end()) return it->second; + size_t id = ids_.size(); + return ids_[str] = id; + } + + private: + struct IdMapCmp { + bool operator()(const std::string* first, const std::string* second) const { + return *first < *second; + } + }; + typedef std::map IdMap; + IdMap ids_; +}; + +} // namespace + +std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right) { + std::vector left_ids, right_ids; + { + InternalStrings intern_table; + for (size_t i = 0; i < left.size(); ++i) { + left_ids.push_back(intern_table.GetId(&left[i])); + } + for (size_t i = 0; i < right.size(); ++i) { + right_ids.push_back(intern_table.GetId(&right[i])); + } + } + return CalculateOptimalEdits(left_ids, right_ids); +} + +namespace { + +// Helper class that holds the state for one hunk and prints it out to the +// stream. +// It reorders adds/removes when possible to group all removes before all +// adds. It also adds the hunk header before printing into the stream. +class Hunk { + public: + Hunk(size_t left_start, size_t right_start) + : left_start_(left_start), + right_start_(right_start), + adds_(), + removes_(), + common_() {} + + void PushLine(char edit, const char* line) { + switch (edit) { + case ' ': + ++common_; + FlushEdits(); + hunk_.push_back(std::make_pair(' ', line)); + break; + case '-': + ++removes_; + hunk_removes_.push_back(std::make_pair('-', line)); + break; + case '+': + ++adds_; + hunk_adds_.push_back(std::make_pair('+', line)); + break; + } + } + + void PrintTo(std::ostream* os) { + PrintHeader(os); + FlushEdits(); + for (std::list >::const_iterator it = + hunk_.begin(); + it != hunk_.end(); ++it) { + *os << it->first << it->second << "\n"; + } + } + + bool has_edits() const { return adds_ || removes_; } + + private: + void FlushEdits() { + hunk_.splice(hunk_.end(), hunk_removes_); + hunk_.splice(hunk_.end(), hunk_adds_); + } + + // Print a unified diff header for one hunk. + // The format is + // "@@ -, +, @@" + // where the left/right parts are omitted if unnecessary. + void PrintHeader(std::ostream* ss) const { + *ss << "@@ "; + if (removes_) { + *ss << "-" << left_start_ << "," << (removes_ + common_); + } + if (removes_ && adds_) { + *ss << " "; + } + if (adds_) { + *ss << "+" << right_start_ << "," << (adds_ + common_); + } + *ss << " @@\n"; + } + + size_t left_start_, right_start_; + size_t adds_, removes_, common_; + std::list > hunk_, hunk_adds_, hunk_removes_; +}; + +} // namespace + +// Create a list of diff hunks in Unified diff format. +// Each hunk has a header generated by PrintHeader above plus a body with +// lines prefixed with ' ' for no change, '-' for deletion and '+' for +// addition. +// 'context' represents the desired unchanged prefix/suffix around the diff. +// If two hunks are close enough that their contexts overlap, then they are +// joined into one hunk. +std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context) { + const std::vector edits = CalculateOptimalEdits(left, right); + + size_t l_i = 0, r_i = 0, edit_i = 0; + std::stringstream ss; + while (edit_i < edits.size()) { + // Find first edit. + while (edit_i < edits.size() && edits[edit_i] == kEditMatch) { + ++l_i; + ++r_i; + ++edit_i; + } + + // Find the first line to include in the hunk. + const size_t prefix_context = std::min(l_i, context); + Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); + for (size_t i = prefix_context; i > 0; --i) { + hunk.PushLine(' ', left[l_i - i].c_str()); + } + + // Iterate the edits until we found enough suffix for the hunk or the input + // is over. + size_t n_suffix = 0; + for (; edit_i < edits.size(); ++edit_i) { + if (n_suffix >= context) { + // Continue only if the next hunk is very close. + std::vector::const_iterator it = edits.begin() + edit_i; + while (it != edits.end() && *it == kEditMatch) ++it; + if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { + // There is no next edit or it is too far away. + break; + } + } + + EditType edit = edits[edit_i]; + // Reset count when a non match is found. + n_suffix = edit == kEditMatch ? n_suffix + 1 : 0; + + if (edit == kEditMatch || edit == kEditRemove || edit == kEditReplace) { + hunk.PushLine(edit == kEditMatch ? ' ' : '-', left[l_i].c_str()); + } + if (edit == kEditAdd || edit == kEditReplace) { + hunk.PushLine('+', right[r_i].c_str()); + } + + // Advance indices, depending on edit type. + l_i += edit != kEditAdd; + r_i += edit != kEditRemove; + } + + if (!hunk.has_edits()) { + // We are done. We don't want this hunk. + break; + } + + hunk.PrintTo(&ss); + } + return ss.str(); +} + +} // namespace internal +} // namespace testing diff --git a/googletest/src/gtest.cc b/googletest/src/gtest.cc index 97966377..ba706c1e 100644 --- a/googletest/src/gtest.cc +++ b/googletest/src/gtest.cc @@ -46,7 +46,6 @@ #include #include #include -#include #include #include // NOLINT #include @@ -1068,246 +1067,6 @@ AssertionResult AssertionFailure(const Message& message) { namespace internal { -namespace edit_distance { -std::vector CalculateOptimalEdits(const std::vector& left, - const std::vector& right) { - std::vector > costs( - left.size() + 1, std::vector(right.size() + 1)); - std::vector > best_move( - left.size() + 1, std::vector(right.size() + 1)); - - // Populate for empty right. - for (size_t l_i = 0; l_i < costs.size(); ++l_i) { - costs[l_i][0] = static_cast(l_i); - best_move[l_i][0] = kRemove; - } - // Populate for empty left. - for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { - costs[0][r_i] = static_cast(r_i); - best_move[0][r_i] = kAdd; - } - - for (size_t l_i = 0; l_i < left.size(); ++l_i) { - for (size_t r_i = 0; r_i < right.size(); ++r_i) { - if (left[l_i] == right[r_i]) { - // Found a match. Consume it. - costs[l_i + 1][r_i + 1] = costs[l_i][r_i]; - best_move[l_i + 1][r_i + 1] = kMatch; - continue; - } - - const double add = costs[l_i + 1][r_i]; - const double remove = costs[l_i][r_i + 1]; - const double replace = costs[l_i][r_i]; - if (add < remove && add < replace) { - costs[l_i + 1][r_i + 1] = add + 1; - best_move[l_i + 1][r_i + 1] = kAdd; - } else if (remove < add && remove < replace) { - costs[l_i + 1][r_i + 1] = remove + 1; - best_move[l_i + 1][r_i + 1] = kRemove; - } else { - // We make replace a little more expensive than add/remove to lower - // their priority. - costs[l_i + 1][r_i + 1] = replace + 1.00001; - best_move[l_i + 1][r_i + 1] = kReplace; - } - } - } - - // Reconstruct the best path. We do it in reverse order. - std::vector best_path; - for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) { - EditType move = best_move[l_i][r_i]; - best_path.push_back(move); - l_i -= move != kAdd; - r_i -= move != kRemove; - } - std::reverse(best_path.begin(), best_path.end()); - return best_path; -} - -namespace { - -// Helper class to convert string into ids with deduplication. -class InternalStrings { - public: - size_t GetId(const std::string& str) { - IdMap::iterator it = ids_.find(str); - if (it != ids_.end()) return it->second; - size_t id = ids_.size(); - return ids_[str] = id; - } - - private: - typedef std::map IdMap; - IdMap ids_; -}; - -} // namespace - -std::vector CalculateOptimalEdits( - const std::vector& left, - const std::vector& right) { - std::vector left_ids, right_ids; - { - InternalStrings intern_table; - for (size_t i = 0; i < left.size(); ++i) { - left_ids.push_back(intern_table.GetId(left[i])); - } - for (size_t i = 0; i < right.size(); ++i) { - right_ids.push_back(intern_table.GetId(right[i])); - } - } - return CalculateOptimalEdits(left_ids, right_ids); -} - -namespace { - -// Helper class that holds the state for one hunk and prints it out to the -// stream. -// It reorders adds/removes when possible to group all removes before all -// adds. It also adds the hunk header before printint into the stream. -class Hunk { - public: - Hunk(size_t left_start, size_t right_start) - : left_start_(left_start), - right_start_(right_start), - adds_(), - removes_(), - common_() {} - - void PushLine(char edit, const char* line) { - switch (edit) { - case ' ': - ++common_; - FlushEdits(); - hunk_.push_back(std::make_pair(' ', line)); - break; - case '-': - ++removes_; - hunk_removes_.push_back(std::make_pair('-', line)); - break; - case '+': - ++adds_; - hunk_adds_.push_back(std::make_pair('+', line)); - break; - } - } - - void PrintTo(std::ostream* os) { - PrintHeader(os); - FlushEdits(); - for (std::list >::const_iterator it = - hunk_.begin(); - it != hunk_.end(); ++it) { - *os << it->first << it->second << "\n"; - } - } - - bool has_edits() const { return adds_ || removes_; } - - private: - void FlushEdits() { - hunk_.splice(hunk_.end(), hunk_removes_); - hunk_.splice(hunk_.end(), hunk_adds_); - } - - // Print a unified diff header for one hunk. - // The format is - // "@@ -, +, @@" - // where the left/right parts are omitted if unnecessary. - void PrintHeader(std::ostream* ss) const { - *ss << "@@ "; - if (removes_) { - *ss << "-" << left_start_ << "," << (removes_ + common_); - } - if (removes_ && adds_) { - *ss << " "; - } - if (adds_) { - *ss << "+" << right_start_ << "," << (adds_ + common_); - } - *ss << " @@\n"; - } - - size_t left_start_, right_start_; - size_t adds_, removes_, common_; - std::list > hunk_, hunk_adds_, hunk_removes_; -}; - -} // namespace - -// Create a list of diff hunks in Unified diff format. -// Each hunk has a header generated by PrintHeader above plus a body with -// lines prefixed with ' ' for no change, '-' for deletion and '+' for -// addition. -// 'context' represents the desired unchanged prefix/suffix around the diff. -// If two hunks are close enough that their contexts overlap, then they are -// joined into one hunk. -std::string CreateUnifiedDiff(const std::vector& left, - const std::vector& right, - size_t context) { - const std::vector edits = CalculateOptimalEdits(left, right); - - size_t l_i = 0, r_i = 0, edit_i = 0; - std::stringstream ss; - while (edit_i < edits.size()) { - // Find first edit. - while (edit_i < edits.size() && edits[edit_i] == kMatch) { - ++l_i; - ++r_i; - ++edit_i; - } - - // Find the first line to include in the hunk. - const size_t prefix_context = std::min(l_i, context); - Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); - for (size_t i = prefix_context; i > 0; --i) { - hunk.PushLine(' ', left[l_i - i].c_str()); - } - - // Iterate the edits until we found enough suffix for the hunk or the input - // is over. - size_t n_suffix = 0; - for (; edit_i < edits.size(); ++edit_i) { - if (n_suffix >= context) { - // Continue only if the next hunk is very close. - std::vector::const_iterator it = edits.begin() + edit_i; - while (it != edits.end() && *it == kMatch) ++it; - if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { - // There is no next edit or it is too far away. - break; - } - } - - EditType edit = edits[edit_i]; - // Reset count when a non match is found. - n_suffix = edit == kMatch ? n_suffix + 1 : 0; - - if (edit == kMatch || edit == kRemove || edit == kReplace) { - hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str()); - } - if (edit == kAdd || edit == kReplace) { - hunk.PushLine('+', right[r_i].c_str()); - } - - // Advance indices, depending on edit type. - l_i += edit != kAdd; - r_i += edit != kRemove; - } - - if (!hunk.has_edits()) { - // We are done. We don't want this hunk. - break; - } - - hunk.PrintTo(&ss); - } - return ss.str(); -} - -} // namespace edit_distance - namespace { // The string representation of the values received in EqFailure() are already @@ -1379,8 +1138,7 @@ AssertionResult EqFailure(const char* lhs_expression, const std::vector rhs_lines = SplitEscapedString(rhs_value); if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { - msg << "\nWith diff:\n" - << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); + msg << "\nWith diff:\n" << CreateUnifiedDiff(lhs_lines, rhs_lines); } } diff --git a/googletest/test/gtest_unittest.cc b/googletest/test/gtest_unittest.cc index f7213fbf..7f6a2ac4 100644 --- a/googletest/test/gtest_unittest.cc +++ b/googletest/test/gtest_unittest.cc @@ -215,6 +215,7 @@ using testing::GTEST_FLAG(stream_result_to); using testing::GTEST_FLAG(throw_on_failure); using testing::IsNotSubstring; using testing::IsSubstring; +using testing::kMaxStackTraceDepth; using testing::Message; using testing::ScopedFakeTestPartResultReporter; using testing::StaticAssertTypeEq; @@ -234,16 +235,18 @@ using testing::internal::AlwaysTrue; using testing::internal::AppendUserMessage; using testing::internal::ArrayAwareFind; using testing::internal::ArrayEq; +using testing::internal::CalculateOptimalEdits; using testing::internal::CodePointToUtf8; using testing::internal::CompileAssertTypesEqual; using testing::internal::CopyArray; using testing::internal::CountIf; +using testing::internal::CreateUnifiedDiff; +using testing::internal::EditType; using testing::internal::EqFailure; using testing::internal::FloatingPoint; using testing::internal::ForEach; using testing::internal::FormatEpochTimeInMillisAsIso8601; using testing::internal::FormatTimeInMillisAsSeconds; -using testing::internal::GTestFlagSaver; using testing::internal::GetCurrentOsStackTraceExceptTop; using testing::internal::GetElementOr; using testing::internal::GetNextRandomSeed; @@ -252,6 +255,7 @@ using testing::internal::GetTestTypeId; using testing::internal::GetTimeInMillis; using testing::internal::GetTypeId; using testing::internal::GetUnitTestImpl; +using testing::internal::GTestFlagSaver; using testing::internal::ImplicitlyConvertible; using testing::internal::Int32; using testing::internal::Int32FromEnvOrDie; @@ -259,6 +263,8 @@ using testing::internal::IsAProtocolMessage; using testing::internal::IsContainer; using testing::internal::IsContainerTest; using testing::internal::IsNotContainer; +using testing::internal::kMaxRandomSeed; +using testing::internal::kTestTypeIdInGoogleTest; using testing::internal::NativeArray; using testing::internal::OsStackTraceGetter; using testing::internal::OsStackTraceGetterInterface; @@ -280,12 +286,6 @@ using testing::internal::TestResultAccessor; using testing::internal::UInt32; using testing::internal::UnitTestImpl; using testing::internal::WideStringToUtf8; -using testing::internal::edit_distance::CalculateOptimalEdits; -using testing::internal::edit_distance::CreateUnifiedDiff; -using testing::internal::edit_distance::EditType; -using testing::internal::kMaxRandomSeed; -using testing::internal::kTestTypeIdInGoogleTest; -using testing::kMaxStackTraceDepth; #if GTEST_HAS_STREAM_REDIRECTION using testing::internal::CaptureStdout; @@ -3517,14 +3517,14 @@ TEST(EditDistance, TestCases) { {__LINE__, "ABCD", "abcd", "////", "@@ -1,4 +1,4 @@\n-A\n-B\n-C\n-D\n+a\n+b\n+c\n+d\n"}, // Path finding. - {__LINE__, "ABCDEFGH", "ABXEGH1", " -/ - +", + {__LINE__, "ABCDEFGH", "ABXEGH1", " /- - +", "@@ -1,8 +1,7 @@\n A\n B\n-C\n-D\n+X\n E\n-F\n G\n H\n+1\n"}, - {__LINE__, "AAAABCCCC", "ABABCDCDC", "- / + / ", - "@@ -1,9 +1,9 @@\n-A\n A\n-A\n+B\n A\n B\n C\n+D\n C\n-C\n+D\n C\n"}, - {__LINE__, "ABCDE", "BCDCD", "- +/", + {__LINE__, "AAAABCCCC", "ABABCDCDC", " -/ + / ", + "@@ -1,9 +1,9 @@\n A\n-A\n-A\n+B\n A\n B\n C\n+D\n C\n-C\n+D\n C\n"}, + {__LINE__, "ABCDE", "BCDCD", "- /+", "@@ -1,5 +1,5 @@\n-A\n B\n C\n D\n-E\n+C\n+D\n"}, - {__LINE__, "ABCDEFGHIJKL", "BCDCDEFGJKLJK", "- ++ -- ++", - "@@ -1,4 +1,5 @@\n-A\n B\n+C\n+D\n C\n D\n" + {__LINE__, "ABCDEFGHIJKL", "BGDCDEFGJKLJK", "- ++ -- ++", + "@@ -1,4 +1,5 @@\n-A\n B\n+G\n+D\n C\n D\n" "@@ -6,7 +7,7 @@\n F\n G\n-H\n-I\n J\n K\n L\n+J\n+K\n"}, {}}; for (const Case* c = kCases; c->left; ++c) { @@ -3542,6 +3542,57 @@ TEST(EditDistance, TestCases) { } } +// Tests that we can run CalculateOptimalEdits for a large vector, i.e. we can +// compute diffs for large strings. +TEST(EditDistance, LargeVectorWithDiffs) { + const int kSize = 300000; + std::vector left; + std::vector right; + std::vector expected(kSize, testing::internal::kEditMatch); + + left.reserve(kSize); + right.reserve(kSize); + for (int i = 0; i < kSize; ++i) { + // Make the contents of the vectors unique. This greatly speeds up + // the algorithm, since it doesn't spend time finding matches for + // different alignments. + left.push_back(i); + right.push_back(i); + } + + for (int i = 0; i < 10; ++i) { + right[i] = kSize + i; + expected[i] = testing::internal::kEditReplace; + right[kSize - i - 1] = kSize * 2 + i; + expected[kSize - i - 1] = testing::internal::kEditReplace; + } + size_t memory_usage; + EXPECT_EQ(CalculateOptimalEdits(left, right, &memory_usage), expected); + EXPECT_GT(memory_usage, kSize); + EXPECT_LT(memory_usage, kSize * 2); +} + +// Tests that we can run CalculateOptimalEdits for two vectors N and M, where +// M = N plus additional junk at the end. The current algorithm only does O(M) +// "real" work in this case, but allocates some extra memory. We test that this +// is still fast enough for common cases, and we aren't allocating an +// excessive amount of extra memory. +TEST(EditDistance, LargeVectorWithTrailingJunk) { + const int kSize = 200000; + const int kAdditionalSize = 2000; + std::vector left(kSize, 0); + std::vector right(kSize + kAdditionalSize, 0); + std::vector expected(kSize + kAdditionalSize, + testing::internal::kEditMatch); + for (int i = 0; i < kAdditionalSize; ++i) { + expected[i + kSize] = testing::internal::kEditAdd; + } + size_t memory_usage; + EXPECT_EQ(CalculateOptimalEdits(left, right, &memory_usage), expected); + EXPECT_GT(memory_usage, kSize); + EXPECT_LT(memory_usage, 6000000); +} + // Tests EqFailure(), used for implementing *EQ* assertions. TEST(AssertionTest, EqFailure) { const std::string foo_val("5"), bar_val("6");