Skip to content

Commit 854061e

Browse files
newren authored and gitster committed
grep: prefetch necessary blobs
In partial clones, `git grep` fetches necessary blobs on-demand one at a time, which can be very slow. To avoid this, when running in a partial clone, add an extra preliminary walk over the tree, similar to grep_tree(), which collects the blobs of interest and then prefetches them.

Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 463c1bf commit 854061e

2 files changed

Lines changed: 201 additions & 0 deletions

File tree

builtin/grep.c

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
#include "object-file.h"
2929
#include "object-name.h"
3030
#include "odb.h"
31+
#include "oid-array.h"
32+
#include "oidset.h"
3133
#include "packfile.h"
3234
#include "pager.h"
3335
#include "path.h"
36+
#include "promisor-remote.h"
3437
#include "read-cache-ll.h"
3538
#include "write-or-die.h"
3639

@@ -692,6 +695,144 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
692695
return hit;
693696
}
694697

698+
static void collect_blob_oids_for_tree(struct repository *repo,
699+
const struct pathspec *pathspec,
700+
struct tree_desc *tree,
701+
struct strbuf *base,
702+
int tn_len,
703+
struct oidset *blob_oids)
704+
{
705+
struct name_entry entry;
706+
int old_baselen = base->len;
707+
struct strbuf name = STRBUF_INIT;
708+
enum interesting match = entry_not_interesting;
709+
710+
while (tree_entry(tree, &entry)) {
711+
if (match != all_entries_interesting) {
712+
strbuf_addstr(&name, base->buf + tn_len);
713+
match = tree_entry_interesting(repo->index,
714+
&entry, &name,
715+
pathspec);
716+
strbuf_reset(&name);
717+
718+
if (match == all_entries_not_interesting)
719+
break;
720+
if (match == entry_not_interesting)
721+
continue;
722+
}
723+
724+
strbuf_add(base, entry.path, tree_entry_len(&entry));
725+
726+
if (S_ISREG(entry.mode)) {
727+
if (!odb_has_object(repo->objects, &entry.oid, 0))
728+
oidset_insert(blob_oids, &entry.oid);
729+
} else if (S_ISDIR(entry.mode)) {
730+
enum object_type type;
731+
struct tree_desc sub_tree;
732+
void *data;
733+
unsigned long size;
734+
735+
data = odb_read_object(repo->objects, &entry.oid,
736+
&type, &size);
737+
if (!data)
738+
die(_("unable to read tree (%s)"),
739+
oid_to_hex(&entry.oid));
740+
741+
strbuf_addch(base, '/');
742+
init_tree_desc(&sub_tree, &entry.oid, data, size);
743+
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
744+
base, tn_len, blob_oids);
745+
free(data);
746+
}
747+
/*
748+
* ...no else clause for S_ISGITLINK: submodules have their
749+
* own promisor configuration and would need separate fetches
750+
* anyway.
751+
*/
752+
753+
strbuf_setlen(base, old_baselen);
754+
}
755+
756+
strbuf_release(&name);
757+
}
758+
759+
static void collect_blob_oids_for_treeish(struct grep_opt *opt,
760+
const struct pathspec *pathspec,
761+
const struct object_id *tree_ish_oid,
762+
const char *name,
763+
struct oidset *blob_oids)
764+
{
765+
struct tree_desc tree;
766+
void *data;
767+
unsigned long size;
768+
struct strbuf base = STRBUF_INIT;
769+
int len;
770+
771+
data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
772+
OBJ_TREE, &size, NULL);
773+
774+
if (!data)
775+
return;
776+
777+
len = name ? strlen(name) : 0;
778+
if (len) {
779+
strbuf_add(&base, name, len);
780+
strbuf_addch(&base, ':');
781+
}
782+
init_tree_desc(&tree, tree_ish_oid, data, size);
783+
784+
collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
785+
&base, base.len, blob_oids);
786+
787+
strbuf_release(&base);
788+
free(data);
789+
}
790+
791+
static void prefetch_grep_blobs(struct grep_opt *opt,
792+
const struct pathspec *pathspec,
793+
const struct object_array *list)
794+
{
795+
struct oidset blob_oids = OIDSET_INIT;
796+
797+
/* Exit if we're not in a partial clone */
798+
if (!repo_has_promisor_remote(opt->repo))
799+
return;
800+
801+
/* For each tree, gather the blobs in it */
802+
for (int i = 0; i < list->nr; i++) {
803+
struct object *real_obj;
804+
805+
obj_read_lock();
806+
real_obj = deref_tag(opt->repo, list->objects[i].item,
807+
NULL, 0);
808+
obj_read_unlock();
809+
810+
if (real_obj &&
811+
(real_obj->type == OBJ_COMMIT ||
812+
real_obj->type == OBJ_TREE))
813+
collect_blob_oids_for_treeish(opt, pathspec,
814+
&real_obj->oid,
815+
list->objects[i].name,
816+
&blob_oids);
817+
}
818+
819+
/* Prefetch the blobs we found */
820+
if (oidset_size(&blob_oids)) {
821+
struct oid_array to_fetch = OID_ARRAY_INIT;
822+
struct oidset_iter iter;
823+
const struct object_id *oid;
824+
825+
oidset_iter_init(&blob_oids, &iter);
826+
while ((oid = oidset_iter_next(&iter)))
827+
oid_array_append(&to_fetch, oid);
828+
829+
promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
830+
831+
oid_array_clear(&to_fetch);
832+
}
833+
oidset_clear(&blob_oids);
834+
}
835+
695836
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
696837
struct object *obj, const char *name, const char *path)
697838
{
@@ -732,6 +873,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
732873
int hit = 0;
733874
const unsigned int nr = list->nr;
734875

876+
prefetch_grep_blobs(opt, pathspec, list);
877+
735878
for (i = 0; i < nr; i++) {
736879
struct object *real_obj;
737880

t/t7810-grep.sh

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1929,4 +1929,62 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
19291929
test_cmp expected actual
19301930
'
19311931

1932+
test_expect_success 'grep of revision in partial clone batches prefetch and honors pathspec' '
	test_when_finished "rm -rf grep-partial-src grep-partial" &&

	# Source repository: two .txt files under a/ and one .md under b/.
	git init grep-partial-src &&
	(
		cd grep-partial-src &&
		git config uploadpack.allowanysha1inwant 1 &&
		git config uploadpack.allowfilter 1 &&
		mkdir a b &&
		echo "needle again" >b/matches.md &&
		echo "nothing to see here" >a/nomatch.txt &&
		echo "needle in haystack" >a/matches.txt &&
		git add . &&
		git commit -m "initial"
	) &&

	git clone --no-checkout --filter=blob:none \
		"file://$(pwd)/grep-partial-src" grep-partial &&

	# Right after the blobless clone, none of the three blobs is local.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 3 missing &&

	# Limit the grep to a/*.txt: only those two blobs should be
	# prefetched, and both should arrive in one batched request.
	GIT_TRACE2_EVENT="$(pwd)/grep-trace-pathspec" \
		git -C grep-partial grep -c "needle" HEAD -- "a/*.txt" >result &&

	# Of the matched paths, a/matches.txt is the only one with "needle".
	test_line_count = 1 result &&

	# Two blobs requested from the promisor, two objects packed by
	# the server in its response.
	test_trace2_data promisor fetch_count 2 <grep-trace-pathspec &&
	test_trace2_data pack-objects written 2 <grep-trace-pathspec &&

	# The blob for b/matches.md has not been fetched yet.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 1 missing &&

	# Without a pathspec the grep has to recurse into both
	# subdirectories, yet only the one blob that is still missing
	# should be requested from the promisor.
	GIT_TRACE2_EVENT="$(pwd)/grep-trace-all" \
		git -C grep-partial grep -c "needle" HEAD >result &&

	test_line_count = 2 result &&
	test_trace2_data promisor fetch_count 1 <grep-trace-all &&
	test_trace2_data pack-objects written 1 <grep-trace-all &&

	# Nothing is missing any more.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 0 missing
'
1989+
19321990
test_done

0 commit comments

Comments
 (0)