mirror of
https://github.com/TheAlgorithms/Ruby
synced 2024-11-16 19:50:00 +01:00
Solving the Top k most frequent words
problem using a max-heap
This commit is contained in:
parent
cb556451d2
commit
1846c762ed
2 changed files with 64 additions and 0 deletions
36
strings/max_k_most_frequent_words.rb
Normal file
36
strings/max_k_most_frequent_words.rb
Normal file
|
@ -0,0 +1,36 @@
|
|||
require_relative '../data_structures/heaps/max_heap'
|
||||
|
||||
##
# This class represents word count information
# (i.e. how many occurrences there are of a word).
#
# Instances are ordered by their number of occurrences only; the word itself
# takes no part in comparisons. Because Comparable derives `==` from `<=>`,
# two instances with the same number of occurrences compare as equal even
# when their words differ.

class WordCount
  include Comparable

  # The counted word, and how many times it occurs.
  attr_reader :word, :occurrences

  ##
  # Creates the count information for `word`, seen `occurrences` times.

  def initialize(word, occurrences)
    @word = word
    @occurrences = occurrences
  end

  ##
  # Compares this word count with `other` by number of occurrences.
  #
  # Returns -1, 0 or 1, or `nil` when `other` does not expose an occurrence
  # count. Returning `nil` (rather than raising NoMethodError) follows the
  # `<=>` convention: `==` then answers false and the ordering operators raise
  # a descriptive ArgumentError.

  def <=>(other)
    return unless other.respond_to?(:occurrences)

    occurrences <=> other.occurrences
  end
end
|
||||
|
||||
##
# Finds the `k` words that appear most often in `words`, listed from the most
# frequent to the least frequent.
# Every element of the provided list counts as one word.
#
# When `k` exceeds the number of distinct words, it is capped at the number
# of distinct words.

def max_k_most_frequent_words(words, k)
  frequencies = words.tally
  entries = frequencies.map { |word, occurrences| WordCount.new(word, occurrences) }
  # NOTE(review): MaxHeap comes from ../data_structures/heaps/max_heap; extract_max
  # is presumed to remove and return the largest remaining entry - confirm there.
  heap = MaxHeap.new(entries)
  limit = [k, frequencies.size].min
  limit.times.map { heap.extract_max.word }
end
|
28
strings/max_k_most_frequent_words_test.rb
Normal file
28
strings/max_k_most_frequent_words_test.rb
Normal file
|
@ -0,0 +1,28 @@
|
|||
require 'minitest/autorun'
|
||||
require_relative 'max_k_most_frequent_words'
|
||||
|
||||
##
# Unit tests for max_k_most_frequent_words.
#
# Uses assert_equal(expected, actual) so that a failing test reports both the
# expected and the actual list instead of a bare "expected false to be truthy".

class TestMaxKMostFrequentWords < Minitest::Test
  # Shared input: 'c' occurs three times, 'a' twice and 'b' once.
  WORDS = %w[a b c a c c].freeze

  def test_top_3_frequent_words
    assert_equal(%w[c a b], max_k_most_frequent_words(WORDS, 3))
  end

  def test_top_2_frequent_words
    assert_equal(%w[c a], max_k_most_frequent_words(WORDS, 2))
  end

  def test_top_frequent_word
    assert_equal(%w[c], max_k_most_frequent_words(WORDS, 1))
  end

  def test_no_frequent_word_given_zero_k
    assert_equal([], max_k_most_frequent_words(WORDS, 0))
  end

  def test_no_frequent_word_given_empty_word_list
    assert_equal([], max_k_most_frequent_words([], 1))
  end

  def test_all_frequent_words_given_k_too_large
    assert_equal(%w[a], max_k_most_frequent_words(%w[a a], 2))
  end
end
|
Loading…
Reference in a new issue