Solving the Top k most frequent words problem using a max-heap

This commit is contained in:
Amos Paribocci 2023-02-09 12:30:16 +01:00
parent cb556451d2
commit 1846c762ed
2 changed files with 64 additions and 0 deletions

View file

@ -0,0 +1,36 @@
require_relative '../data_structures/heaps/max_heap'
##
# Pairs a word with the number of times it has been seen.
#
# Instances are ordered by their occurrence count alone (the word itself
# plays no part in the comparison), so two counts for different words that
# share the same number of occurrences compare as equal.
class WordCount
  include Comparable

  attr_reader :word, :occurrences

  def initialize(word, occurrences)
    @word = word
    @occurrences = occurrences
  end

  # Three-way comparison on the occurrence count; Comparable derives
  # <, <=, ==, >=, > and between? from it.
  def <=>(other)
    occurrences.<=>(other.occurrences)
  end
end
##
# Picks the `k` words that occur most often in `words` and returns them
# ordered from the most frequent to the least frequent.
# Here a "word" is simply an element of the given list.
#
# When `k` exceeds the number of distinct words, every distinct word is
# returned (i.e. `k` is capped at the number of distinct words).
#
# NOTE(review): the relative order of words with the same number of
# occurrences depends on how MaxHeap breaks ties - confirm before relying on it.
def max_k_most_frequent_words(words, k)
  occurrences_by_word = words.tally
  word_counts = occurrences_by_word.map { |word, occurrences| WordCount.new(word, occurrences) }
  heap = MaxHeap.new(word_counts)

  # Never pop more entries than the heap holds.
  result_size = [k, occurrences_by_word.size].min
  result_size.times.map { heap.extract_max.word }
end

View file

@ -0,0 +1,28 @@
require 'minitest/autorun'
require_relative 'max_k_most_frequent_words'
##
# Unit tests for max_k_most_frequent_words.
#
# Assertions use assert_equal (expected first, actual second) so that a
# failure reports both values instead of a bare "expected truthy" message.
class TestMaxKMostFrequentWords < Minitest::Test
  # Shared fixture: 'c' occurs three times, 'a' twice and 'b' once, so the
  # expected ordering never depends on how ties are broken.
  WORDS = %w[a b c a c c].freeze

  def test_top_3_frequent_words
    assert_equal %w[c a b], max_k_most_frequent_words(WORDS, 3)
  end

  def test_top_2_frequent_words
    assert_equal %w[c a], max_k_most_frequent_words(WORDS, 2)
  end

  def test_top_frequent_word
    assert_equal %w[c], max_k_most_frequent_words(WORDS, 1)
  end

  def test_no_frequent_word_given_zero_k
    assert_equal [], max_k_most_frequent_words(WORDS, 0)
  end

  def test_no_frequent_word_given_empty_word_list
    assert_equal [], max_k_most_frequent_words([], 1)
  end

  # `k` larger than the number of distinct words is capped at that number.
  def test_all_frequent_words_given_k_too_large
    assert_equal %w[a], max_k_most_frequent_words(%w[a a], 2)
  end
end