%% Source: GitHub Gist by @antoine (antoine/c0ede64f000cf316ace367bbd9cf524c),
%% last active March 20, 2017 02:04.
-module(index).
-export([get_file_contents/1,
show_file_contents/1,
index_file/1,
index_sample_files/0]).
-include_lib("eunit/include/eunit.hrl").
%% Index each of the two sample text files and print the resulting
%% index term to standard output, one file after the other.
index_sample_files() ->
    SampleFiles = ["gettysburg-address.txt", "dickens-christmas.txt"],
    lists:foreach(
        fun(FileName) -> io:format("~p~n", [index_file(FileName)]) end,
        SampleFiles
    ).
%% Words that are never indexed, whatever their length.
commonWords() ->
    [
        "on",
        "the",
        "that"
    ].
%% Build the word index of the text file Name.
%% The file is read, split into a matrix of cleaned-up words (one row per
%% line), indexed, collated into line ranges and finally ordered by word.
%% Returns a list of {Word, Ranges} tuples sorted on Word.
index_file(Name) ->
    Lines = get_file_contents(Name),
    WordMatrix = words(Lines),
    %% Line numbering starts at 1; the matrix is passed twice because
    %% index_words consumes one copy while scanning the other.
    Entries = collate(index_words(1, WordMatrix, WordMatrix)),
    lists:keysort(1, Entries).
%% Turn every {Word, LineNumbers} entry into {Word, Ranges}.
%% The first line number seeds collate_lines/3, whose result is reversed
%% before being stored. An entry with an empty line-number list does not
%% match and crashes, exactly as a malformed entry should.
collate(Entries) ->
    lists:map(
        fun({Word, [First | Others]}) ->
            {Word, lists:reverse(collate_lines(First, First, Others))}
        end,
        Entries
    ).
%% Group a list of line numbers into {Low, High} ranges.
%% Begin is the number that opened the current range, Current is the last
%% number added to it. A following number extends the range only when it is
%% exactly one less than Current, so the input is expected in descending
%% order; any other gap closes the range and opens a new one.
%% Ranges are returned in the order in which they were closed.
collate_lines(Begin, Current, Numbers) ->
    collate_lines(Begin, Current, Numbers, []).

%% Accumulator-passing worker for collate_lines/3; Closed holds the ranges
%% finished so far, most recent first.
collate_lines(Begin, Current, [], Closed) ->
    lists:reverse([{Current, Begin} | Closed]);
collate_lines(Begin, Current, [Next | Rest], Closed) ->
    case Current - Next of
        1 ->
            collate_lines(Begin, Next, Rest, Closed);
        _Gap ->
            collate_lines(Next, Next, Rest, [{Current, Begin} | Closed])
    end.
%% Convert a list of text lines into a matrix of words: each line has its
%% punctuation removed, is lower-cased and is then split into words.
%% An empty line yields an empty row, so row positions still correspond to
%% line positions.
words(Lines) ->
    [split_on_space(nocaps(nopunc(Line)), []) || Line <- Lines].
%% Build the raw index entries for every indexable word of the text.
%%
%% LNb      - line number given to the first row of AllLines
%%            (index_file/1 passes 1).
%% Rows     - the words still to be visited, one row per line.
%% AllLines - the complete word matrix used to look up line numbers.
%%
%% A word is skipped when it has two characters or fewer or is listed in
%% commonWords/0. Every other word is looked up in AllLines and then removed
%% from both matrices, so it is indexed only once; removal leaves the rows
%% in place, which keeps the line numbering stable.
%%
%% Returns a list of {Word, LineNumbers} tuples in order of first
%% appearance. An empty matrix (e.g. an empty file) yields [] rather than
%% failing with function_clause.
index_words(_LNb, [], _AllLines) ->
    [];
index_words(LNb, [[] | Lines], AllLines) ->
    index_words(LNb, Lines, AllLines);
index_words(LNb, [[Word | Line] | Lines], AllLines) ->
    %% orelse short-circuits: the common-word lookup is skipped for short words.
    case length(Word) =< 2 orelse member(Word, commonWords()) of
        true ->
            index_words(LNb, [Line | Lines], AllLines);
        false ->
            Entry = index_word(LNb, Word, AllLines, []),
            Remaining = remove_word_lines(Word, [Line | Lines]),
            RemainingAll = remove_word_lines(Word, AllLines),
            [Entry | index_words(LNb, Remaining, RemainingAll)]
    end.
%% Collect the numbers of the lines in which Word occurs.
%% LNb is the number of the first remaining line. Each matching line number
%% is pushed onto MatchedLines, so the result lists the most recently
%% visited line first.
%% Returns {Word, MatchedLines}.
index_word(_LNb, Word, [], MatchedLines) ->
    {Word, MatchedLines};
index_word(LNb, Word, [Line | Rest], MatchedLines) ->
    Updated =
        case member(Word, Line) of
            true -> [LNb | MatchedLines];
            false -> MatchedLines
        end,
    index_word(LNb + 1, Word, Rest, Updated).
%% Delete every occurrence of W from a matrix of words.
%% Rows are processed one by one and kept even when they become empty.
remove_word_lines(W, Lines) ->
    [remove_word_line(W, Line) || Line <- Lines].
%% Delete every occurrence of W from a single row of words, keeping the
%% remaining words in their original order.
remove_word_line(W, Line) ->
    [Other || Other <- Line, Other =/= W].
%% EUnit test: removing "A" strips it from every row and leaves rows that
%% become empty in place.
remove_word_lines_test() ->
    Matrix = [["A", "B"], ["C", "D"], ["A", "A"]],
    Expected = [["B"], ["C", "D"], []],
    Expected = remove_word_lines("A", Matrix).
%% Split a string into a list of words.
%%
%% The first argument is the text still to be scanned; Word is the word
%% being collected, stored in reverse (callers start with []).
%% Words are separated by runs of whitespace: space, tab, carriage return
%% or newline. Treating tab and CR as separators keeps tab-separated text
%% and lines read from CRLF files from gluing stray characters onto words.
%% Leading, trailing and repeated separators never produce empty words.
split_on_space([], []) ->
    [];
split_on_space([], Word) ->
    [lists:reverse(Word)];
split_on_space([C | Cs], Word) when C =:= $\s; C =:= $\t; C =:= $\r; C =:= $\n ->
    case Word of
        %% No word in progress: just skip the separator.
        [] -> split_on_space(Cs, []);
        _ -> [lists:reverse(Word) | split_on_space(Cs, [])]
    end;
split_on_space([C | Cs], Word) ->
    split_on_space(Cs, [C | Word]).
%% Remove punctuation characters from a string.
%% The characters dropped are: , - . \ ' ( ) [ ] ` " ; : ! ?
%% The last four extend the original set so that a word followed by a
%% semicolon, colon, exclamation mark or question mark is indexed as the
%% same word as its bare form. All other characters are kept in order.
nopunc(Chars) ->
    Punctuation = ",-.\\'()[]`\";:!?",
    [C || C <- Chars, not lists:member(C, Punctuation)].
%% Lower-case a whole string by applying nocap/1 to each character.
nocaps(Xs) ->
    lists:map(fun nocap/1, Xs).
%% Lower-case a single character: values in the range $A..$Z are shifted
%% by 32, every other value is returned unchanged.
nocap(X) when $A =< X, X =< $Z ->
    X + 32;
nocap(X) ->
    X.
%% Return true when the first argument is exactly equal to some element of
%% the list, false otherwise.
member(_Item, []) ->
    false;
member(Item, [Head | Tail]) ->
    Item =:= Head orelse member(Item, Tail).
% Reading a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Return the contents of the text file Name as a list of lines, in file
% order. Fails with a badmatch if the file cannot be opened for reading.
get_file_contents(Name) ->
    {ok, File} = file:open(Name, [read]),
    % get_all_lines/2 accumulates the lines last-first.
    lists:reverse(get_all_lines(File, [])).
% Auxiliary function for get_file_contents.
% Not exported.
% Reads File line by line until end of file, pushing each line onto
% Partial, so the result holds the lines in reverse order. The file is
% closed before returning.
% Only a trailing newline is stripped from a line: a final line that does
% not end in a newline keeps all of its characters.
% A read error closes the file and raises error({read_failed, Reason}).
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            file:close(File),
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Drop a single trailing newline character from Line, if there is one.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Reversed] -> lists:reverse(Reversed);
        _NoNewline -> Line
    end.
% Print each string of a list on its own line of standard output.
% Can be used to check the results of calling get_file_contents.
% Returns ok.
show_file_contents(Ls) ->
    lists:foreach(fun(L) -> io:format("~s~n", [L]) end, Ls).
%% (GitHub Gist page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")