From 6a89cd2cdaad35bee1568fc440a296ddb4a6c366 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A1bi=C3=A1n=20Gergely?= <gergely.fabian@moveoneinc.com>
Date: Tue, 6 Mar 2012 16:42:56 +0100
Subject: [PATCH] Performance improvements for git repo parsing

Parse a revision for a given branch only if
 we haven't already parsed it for another branch.
Moved the db check for existing revisions into a grouped search.
 Search for many revisions at once: this reduces db load.
 Revisions are grouped into sets of 100.
  This is to limit memory consumption.
 There will be just one query per 100 revisions instead of 100 separate queries.
The above two methods significantly increase parsing speed.
 Test case was a git repo with 6000+ commits on a master branch,
 and several other branches originating from master.
 Speed improved from 1.4h to 18min.

script/repository_fetch_changesets added for fetching changesets from console.
---
 app/models/repository/git.rb       |   62 +++++++++++++++++++++++++++++-------
 script/repository_fetch_changesets |   23 +++++++++++++
 2 files changed, 73 insertions(+), 12 deletions(-)
 create mode 100755 script/repository_fetch_changesets

diff --git a/app/models/repository/git.rb b/app/models/repository/git.rb
index 586a6e5..23a8e19 100644
--- a/app/models/repository/git.rb
+++ b/app/models/repository/git.rb
@@ -144,27 +144,65 @@ class Repository::Git < Repository
       merge_extra_info(h)
       self.save
     end
+    # Remember what revisions we already processed (in any branches)
+    all_revisions = []
     scm_brs.each do |br1|
       br = br1.to_s
       from_scmid = nil
       from_scmid = h["branches"][br]["last_scmid"] if h["branches"][br]
       h["branches"][br] ||= {}
-      scm.revisions('', from_scmid, br, {:reverse => true}) do |rev|
-        db_rev = find_changeset_by_name(rev.revision)
+
+      revisions = scm.revisions('', from_scmid, br, {:reverse => true})
+      next if revisions.nil? || revisions.empty?
+
+      # Remember the last commit id here, before we start removing revisions from the array.
+      # We do that as an optimization, but it also means that the array may end up completely empty.
+      last_revision  = revisions.last
+
+      # remove revisions that we have already processed (possibly in other branches)
+      revisions.reject!{|r| all_revisions.include?(r.scmid)}
+      # add revisions that we are to parse now to 'all processed revisions'
+      # (this is equivalent to a union, because we removed the already-seen revisions above)
+      all_revisions += revisions.map{|r| r.scmid}
+
+      # Search for existing revisions in the database in a more efficient manner.
+      #  This replaces the one-by-one queries.
+      #  Find all revisions, that are in the database, and then remove them from the revision array.
+      #  Then later we won't need any conditions for db existence.
+      # Query for several revisions at once, and remove them from the revisions array, if they are there.
+      # Do this in chunks, to avoid eventual memory problems (in case of tens of thousands of commits).
+      # If there are no revisions (because the original code's algorithm filtered them), then this part will be skipped.
+      #  Queries are only made if there is at least one revision.
+      limit = 100
+      offset = 0
+      revisions_copy = revisions.clone # revisions will change
+      while offset < revisions_copy.size
+        recent_changesets_slice = changesets.find(:all, :conditions => ['scmid IN (?)', revisions_copy.slice(offset, limit).map{|x| x.scmid}])
+        # Subtract revisions that redmine already knows about
+        recent_revisions = recent_changesets_slice.map{|c| c.scmid}
+        revisions.reject!{|r| recent_revisions.include?(r.scmid)}
+        offset += limit
+      end
+
+      revisions.each do |rev|
         transaction do
-          if db_rev.nil?
-            db_saved_rev = save_revision(rev)
-            parents = {}
-            parents[db_saved_rev] = rev.parents unless rev.parents.nil?
-            parents.each do |ch, chparents|
-              ch.parents = chparents.collect{|rp| find_changeset_by_name(rp)}.compact
-            end
+          # No db lookup is needed for this revision: the filtering above ensured that it is not in the db yet.
+          db_saved_rev = save_revision(rev)
+          parents = {}
+          parents[db_saved_rev] = rev.parents unless rev.parents.nil?
+          parents.each do |ch, chparents|
+            ch.parents = chparents.collect{|rp| find_changeset_by_name(rp)}.compact
           end
-          h["branches"][br]["last_scmid"] = rev.scmid
-          merge_extra_info(h)
-          self.save
+          # Saving the last scmid was moved out of this loop, because we never get here if the revision was already added for another branch.
         end
       end
+
+      # save the data about the last revision for this branch
+      if last_revision
+        h["branches"][br]["last_scmid"] = last_revision.scmid
+        merge_extra_info(h)
+        self.save
+      end
     end
   end
 
diff --git a/script/repository_fetch_changesets b/script/repository_fetch_changesets
new file mode 100755
index 0000000..9c8f1d1
--- /dev/null
+++ b/script/repository_fetch_changesets
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+# Fetch the changesets of one repository from the console and report the elapsed time.
+# Usage: script/repository_fetch_changesets <repository id>|last
+require File.expand_path('../../config/environment',  __FILE__)
+
+# abort prints to stderr and exits with a non-zero status, so callers can detect the failure.
+abort "No argument given" if ARGV.empty?
+
+# 'last' selects the repository with the highest id. find_by_id returns nil for an
+# unknown id (find would raise RecordNotFound and the check below would never run).
+rep = if ARGV[0] == 'last'
+  Repository.find(:first, :order => 'id DESC')
+else
+  Repository.find_by_id(ARGV[0])
+end
+
+abort "No repository found" unless rep
+
+p rep
+# Measure how long the fetch takes.
+time = Time.now
+rep.fetch_changesets
+puts "Run in #{Time.now - time} seconds."
\ No newline at end of file
-- 
1.7.4.1

