##
# This class represents word count information
# (i.e. how many occurrences there are of a word).
#
# Instances are ordered primarily by their number of occurrences. When two
# instances have the same number of occurrences, the one whose word sorts
# first compares as the greater one, so that repeatedly taking the maximum
# yields tied words in ascending word order. Words that cannot be compared
# with each other are treated as equal for ordering purposes.

class WordCount
  include Comparable

  attr_reader :word, :occurrences

  def initialize(word, occurrences)
    @word = word
    @occurrences = occurrences
  end

  ##
  # Compares this word count with +other+.
  #
  # Returns -1, 0 or 1, or +nil+ when +other+ is not a WordCount or the
  # occurrence counts themselves cannot be compared.

  def <=>(other)
    # Follow the usual `<=>` contract: signal "not comparable" with nil
    # instead of raising NoMethodError on foreign objects.
    return nil unless other.is_a?(WordCount)

    by_occurrences = occurrences <=> other.occurrences
    return by_occurrences unless by_occurrences&.zero?

    # Tie-break: operands are swapped on purpose so that the word sorting
    # first is the "greater" one; fall back to 0 if the words are incomparable.
    (other.word <=> word) || 0
  end
end
##
# Returns the `k` most frequently occurring words, in non-increasing order of occurrence.
# In this context, a word is defined as an element in the provided list.
#
# In case `k` is greater than the number of distinct words, a value of `k` equal
# to the number of distinct words will be considered, instead. A `k` of zero
# or less yields an empty list.
#
# Depends on `WordCount` and on `MaxHeap` (data_structures/heaps/max_heap) being
# loaded. NOTE(review): assumes `MaxHeap.new` accepts an array of comparable
# items and `extract_max` removes and returns the greatest one -- confirm
# against the heap implementation. The relative order of equally frequent
# words is determined by how `WordCount` compares them.

def max_k_most_frequent_words(words, k)
  count_by_word = words.tally

  # Never ask for more words than exist, and never for a negative amount.
  result_size = k.clamp(0, count_by_word.size)
  return [] if result_size.zero?

  heap = MaxHeap.new(count_by_word.map { |word, occurrences| WordCount.new(word, occurrences) })
  Array.new(result_size) { heap.extract_max.word }
end